In [29]:
import yaml
import mysql.connector
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
plt.style.use('fivethirtyeight')
import seaborn as sns
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import plotly.express as px
import squarify
import geopandas as gpd
import matplotlib.patches as mpatches
import matplotlib.colors as mcolors
import folium

import scipy.stats as stats
from scipy.stats import ks_2samp
from scipy.stats import kstest, norm
from scipy.stats import levene
import statsmodels.api as sm
from statsmodels.tsa.stattools import grangercausalitytests

from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.cluster import KMeans
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from keras.models import Sequential
from keras.layers import LSTM, Dense
from keras.layers import LSTM, Dense, Dropout
from keras.callbacks import EarlyStopping
from sklearn.metrics import mean_squared_error
from keras.optimizers import Adam

import warnings
# Silence all warnings for the remainder of the notebook.
# One global filter is sufficient: a simplefilter() call wrapped in
# `with warnings.catch_warnings():` is undone as soon as the block exits.
warnings.filterwarnings('ignore')

In [2]:
# Load the database connection settings from a YAML file.
# The location can be overridden through the DB_CONFIG_PATH environment
# variable; the default is a raw string so the Windows backslashes are taken
# literally instead of being parsed as (invalid) escape sequences.
DB_CONFIG_PATH = os.environ.get(
    "DB_CONFIG_PATH", r"D:\Telangana_Growth_Analysis\db_config.yaml"
)
with open(DB_CONFIG_PATH, 'r') as file:
    db_config = yaml.safe_load(file)

# Connect to the database using the keys of the YAML mapping as keyword arguments
connection = mysql.connector.connect(**db_config)
cursor = connection.cursor()

In [3]:
# Join every row of `documents` to its `districts` record on `dist_code`.
query = """
    select * from documents
    inner join districts on documents.dist_code = districts.dist_code
    """

# Run the query, then order the rows by 'id' and renumber the index from 0.
stamps = (
    pd.read_sql(query, connection)
    .sort_values(by='id', ascending=True)
    .reset_index(drop=True)
)

# Read the whole 'dim_date' table and attach its columns to each row
# whose 'month' value matches (inner join).
dim_date = pd.read_sql("SELECT * FROM dim_date", connection)
stamps = stamps.merge(dim_date, on='month', how='inner')

# Preview the merged frame
stamps.head()

Unnamed: 0,id,dist_code,month,documents_registered_cnt,documents_registered_rev,estamps_challans_cnt,estamps_challans_rev,dist_code.1,district,mmm,quarter,fiscal_year
0,1,14_1,2019-04-01,4533,59236363.0,0,0.0,14_1,Mahabubnagar\r,Apr,Q1,2019
1,2,17_3,2019-04-01,4151,41508762.0,0,0.0,17_3,Siddipet\r,Apr,Q1,2019
2,3,20_3,2019-04-01,2116,23674170.0,0,0.0,20_3,Rajanna Sircilla\r,Apr,Q1,2019
3,4,21_5,2019-04-01,1089,15915285.0,0,0.0,21_5,Mahabubabad\r,Apr,Q1,2019
4,5,23_1,2019-04-01,6133,82593256.0,0,0.0,23_1,Nalgonda\r,Apr,Q1,2019


In [4]:
# Join every row of `vehicles` to its `districts` record on `dist_code`.
query = """
    select * from vehicles
    inner join districts on vehicles.dist_code = districts.dist_code
    """

# Run the query, then order the rows by 'id' and renumber the index from 0.
vehicles = (
    pd.read_sql(query, connection)
    .sort_values(by='id', ascending=True)
    .reset_index(drop=True)
)

# Read the whole 'dim_date' table and attach its columns to each row
# whose 'month' value matches (inner join).
dim_date = pd.read_sql("SELECT * FROM dim_date", connection)
vehicles = vehicles.merge(dim_date, on='month', how='inner')

# Show every column when a frame is displayed, then preview the merged frame
pd.set_option('display.max_columns', None)
vehicles.head()

Unnamed: 0,id,dist_code,month,fuel_type_petrol,fuel_type_diesel,fuel_type_electric,fuel_type_others,vehicleClass_MotorCycle,vehicleClass_MotorCar,vehicleClass_AutoRickshaw,vehicleClass_Agriculture,vehicleClass_others,seatCapacity_1_to_3,seatCapacity_4_to_6,seatCapacity_above_6,brand_new_vehicles,pre_owned_vehicles,category_non_transport,category_transport,dist_code.1,district,mmm,quarter,fiscal_year
0,1,15_1,2019-04-01,17910,3011,76,22,15308,4429,0,4,1278,16110,4182,717,19542,1477,19856,1163,15_1,Rangareddy\r,Apr,Q1,2019
1,2,18_2,2019-04-01,3066,306,6,0,2995,142,49,64,128,3156,189,33,3322,56,3203,175,18_2,Kamareddy\r,Apr,Q1,2019
2,3,20_3,2019-04-01,1577,215,0,0,1546,79,29,21,117,1683,104,5,1751,41,1648,144,20_3,Rajanna Sircilla\r,Apr,Q1,2019
3,4,21_3,2019-04-01,1961,281,2,0,1939,72,72,48,113,2082,146,16,2209,35,2075,169,21_3,Jangoan\r,Apr,Q1,2019
4,5,21_7,2019-04-01,1552,309,0,0,1512,76,69,109,95,1696,145,20,1820,41,1701,160,21_7,Jayashankar Bhupalpally\r,Apr,Q1,2019


In [5]:
# Join every row of `investments` to its `districts` record on `dist_code`.
query = """
    select * from investments
    inner join districts on investments.dist_code = districts.dist_code
    """

# Run the query, then order the rows by 'id' and renumber the index from 0.
investments = (
    pd.read_sql(query, connection)
    .sort_values(by='id', ascending=True)
    .reset_index(drop=True)
)

# Read the whole 'dim_date' table and attach its columns to each row
# whose 'month' value matches (inner join).
dim_date = pd.read_sql("SELECT * FROM dim_date", connection)
investments = investments.merge(dim_date, on='month', how='inner')

# Show every column when a frame is displayed, then preview the merged frame
pd.set_option('display.max_columns', None)
investments.head()

Unnamed: 0,id,dist_code,month,sector,investment_in_cr,number_of_employees,dist_code.1,district,mmm,quarter,fiscal_year
0,1,14_1,2019-04-01,Engineering,2.32,15,14_1,Mahabubnagar\r,Apr,Q1,2019
1,2,19_1,2019-04-01,Engineering,0.63,13,19_1,Adilabad\r,Apr,Q1,2019
2,3,20_3,2019-04-01,Wood and Leather,0.2,8,20_3,Rajanna Sircilla\r,Apr,Q1,2019
3,4,20_3,2019-04-01,Textiles,0.27,27,20_3,Rajanna Sircilla\r,Apr,Q1,2019
4,5,21_5,2019-04-01,Electrical and Electronic Products,0.12,5,21_5,Mahabubabad\r,Apr,Q1,2019


In [6]:
# The district names carry a trailing carriage return ('\r'); remove it with
# a literal (non-regex) replacement.
# Each frame holds two identically named 'dist_code' columns after the SQL
# join; naming the label once drops both. errors='ignore' keeps the cell safe
# to re-run once the columns are already gone.
for frame in (stamps, vehicles, investments):
    frame['district'] = frame['district'].str.replace('\r', '', regex=False)
    frame.drop(columns=['dist_code'], inplace=True, errors='ignore')

In [10]:
# Parse the 'month' column of every fact frame into pandas datetimes.
for frame in (stamps, vehicles, investments):
    frame['month'] = pd.to_datetime(frame['month'])

## Revenue Forecasting: Use time series forecasting models like ARIMA, Prophet, or LSTM to predict future revenue from document registration for each district.



In [11]:
# List the columns left in `stamps` after cleaning
stamps.columns

Index(['id', 'month', 'documents_registered_cnt', 'documents_registered_rev',
       'estamps_challans_cnt', 'estamps_challans_rev', 'district', 'mmm',
       'quarter', 'fiscal_year'],
      dtype='object')

In [13]:
# Sum the four count/revenue measures over all districts for each month.
monthly_measures = [
    'documents_registered_cnt',
    'documents_registered_rev',
    'estamps_challans_cnt',
    'estamps_challans_rev',
]
grouped_data = (
    stamps
    .groupby('month', as_index=False)
    .agg(dict.fromkeys(monthly_measures, 'sum'))
)

# Function to plot individual columns with specified color
def plot_individual_column(column_name, title, color, data=None):
    """Show one column plotted against 'month' as a line-and-marker chart.

    Parameters
    ----------
    column_name : str
        Column drawn on the y-axis.
    title : str
        Figure title.
    color : str
        Line colour handed to Plotly.
    data : pandas.DataFrame, optional
        Frame providing a 'month' column and `column_name`. When omitted, the
        notebook-level `grouped_data` is used (looked up at call time), which
        keeps existing three-argument calls working unchanged.

    The figure is displayed immediately; nothing is returned.
    """
    frame = grouped_data if data is None else data

    fig = go.Figure()
    fig.add_trace(go.Scatter(x=frame['month'],
                             y=frame[column_name],
                             mode='lines+markers',
                             line=dict(color=color)))
    fig.update_layout(title=title,
                      xaxis_title='Month/Year',
                      yaxis_title='Value',
                      template="plotly_dark")
    fig.show()

# One chart per measure: (column, chart title, line colour)
monthly_trend_charts = [
    ('documents_registered_cnt', 'Monthly Trends for Documents Registered Count', 'blue'),
    ('documents_registered_rev', 'Monthly Trends for Documents Registered Revenue', 'green'),
    ('estamps_challans_cnt', 'Monthly Trends for E-stamps Challans Count', 'red'),
    ('estamps_challans_rev', 'Monthly Trends for E-stamps Challans Revenue', 'purple'),
]
for measure, chart_title, line_color in monthly_trend_charts:
    plot_individual_column(measure, chart_title, line_color)


Forecasting (no Prophet model is fitted below; the following cells use an LSTM):

In [23]:
# Group by month and aggregate the revenue by sum (you can also use mean or other aggregation functions as needed)
# Convert 'month' to a month-year period so rows are grouped per calendar month.
# The conversion is skipped when the column is already a Period column, so the
# cell can be re-run: pd.to_datetime does not accept Period-dtype input.
if not isinstance(stamps['month'].dtype, pd.PeriodDtype):
    stamps['month'] = pd.to_datetime(stamps['month']).dt.to_period('M')
grouped = stamps.groupby('month')['documents_registered_rev'].sum().reset_index()

# Convert month back to datetime (the timestamp at the start of each period)
grouped['month'] = grouped['month'].dt.to_timestamp()

In [24]:
# Preview the monthly revenue totals
grouped.head()

Unnamed: 0,month,documents_registered_rev
0,2019-04-01,5293494000.0
1,2019-05-01,5690714000.0
2,2019-06-01,5335687000.0
3,2019-07-01,5133927000.0
4,2019-08-01,5526628000.0


In [32]:
# Scaling and splitting the data
# The first 70% of the months form the training set, the rest the test set.
split_idx = int(0.7*len(grouped))

# Fit the scaler on the training rows only, so the min/max of the held-out
# months does not leak into training; then apply that scaling to every row.
# Test-period values may therefore fall outside the (0, 1) range.
scaler = MinMaxScaler(feature_range=(0, 1))
scaler.fit(grouped[['documents_registered_rev']].iloc[:split_idx])
grouped['scaled_revenue'] = scaler.transform(grouped[['documents_registered_rev']])

train = grouped[:split_idx]
test = grouped[split_idx:]

# Creating dataset for LSTM
def create_dataset(X, y, time_steps=1):
    """Turn a series into sliding-window samples and next-step targets.

    Sample `k` is rows `k .. k + time_steps - 1` of `X` (taken with `.iloc`
    and converted with `.values`); its target is `y.iloc[k + time_steps]`,
    the value immediately after the window.

    Parameters
    ----------
    X : pandas object supporting `.iloc` slicing and `.values`
    y : pandas object supporting `.iloc` integer access
    time_steps : int
        Number of consecutive rows in each window.

    Returns
    -------
    (numpy.ndarray, numpy.ndarray)
        The stacked windows and the matching targets;
        `len(X) - time_steps` samples are produced.
    """
    window_starts = range(len(X) - time_steps)
    windows = [X.iloc[start:start + time_steps].values for start in window_starts]
    targets = [y.iloc[start + time_steps] for start in window_starts]
    return np.array(windows), np.array(targets)

# Window length: how many consecutive months feed each prediction
TIME_STEPS = 1

# Build (window, next value) pairs separately for the train and test periods
X_train, y_train = create_dataset(
    train[['scaled_revenue']], train['scaled_revenue'], time_steps=TIME_STEPS
)
X_test, y_test = create_dataset(
    test[['scaled_revenue']], test['scaled_revenue'], time_steps=TIME_STEPS
)

# Building the LSTM model
# Two stacked 10-unit LSTM layers with dropout in between; the first returns the
# full sequence so the second LSTM can consume it. A single Dense unit outputs
# the next scaled revenue value. The input shape is taken from the training
# windows: (time steps, features).
model = Sequential([
    LSTM(10, return_sequences=True, input_shape=(X_train.shape[1], X_train.shape[2])),
    Dropout(0.2),
    LSTM(10),
    Dense(1)
])

# Compiling the model
# `learning_rate` is the supported argument name; the old `lr` alias is
# deprecated and rejected by newer Keras releases.
optimizer = Adam(learning_rate=0.005)
model.compile(optimizer=optimizer, loss='mean_squared_error')

# Stop once the validation loss has not improved for 5 epochs and
# roll the model back to the best weights seen.
early_stopping = EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)

# Training settings: up to 50 epochs in batches of 5, holding out 20% of the
# training samples for validation.
fit_options = dict(
    epochs=50,
    batch_size=5,
    verbose=1,
    validation_split=0.2,
    callbacks=[early_stopping],
)

# Train the model
model.fit(X_train, y_train, **fit_options)



Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50


<keras.src.callbacks.History at 0x2dd097a0cd0>

In [33]:
# Predict the test windows and map the scaled outputs back to revenue units
predictions = scaler.inverse_transform(model.predict(X_test))

# Map the true scaled targets back as well (the scaler expects a 2-D column)
y_test_inv = scaler.inverse_transform(y_test.reshape(-1, 1))

# Plotting with Plotly
# The first TIME_STEPS test months only serve as input windows, so both
# curves start after them.
forecast_months = test['month'][TIME_STEPS:]
shared_trace_args = dict(x=forecast_months, mode='lines')

# Actual revenue
trace1 = go.Scatter(y=y_test_inv.flatten(), name='True', **shared_trace_args)

# Model output
trace2 = go.Scatter(y=predictions.flatten(), name='Predicted', **shared_trace_args)

layout = go.Layout(
    title='Documents Registered Revenue Prediction',
    xaxis={'title': 'Month'},
    yaxis={'title': 'Revenue'},
    template="plotly_dark",
)

fig = go.Figure(data=[trace1, trace2], layout=layout)
fig.show()



### Incomplete Predictive Analysis Note:
Due to constraints in time and overlapping commitments, the predictive analysis segment of our study is yet to be concluded. We acknowledge this as a limitation in the current phase of our research and are committed to addressing this gap in the subsequent stages of our work.

### Future Work:

#### Stamp Registration:

- **Revenue Forecasting:**
    - **Objective:** Predict future revenue from document registration for each district.
    - **Approach:** Employ time series forecasting models like ARIMA, Prophet, or LSTM.
    - **Application:** Aid in budget planning and resource allocation.

- **Challan Prediction:**
    - **Objective:** Forecast the likelihood of e-Stamp challan issuance.
    - **Approach:** Utilize machine learning models trained on historical data.
    - **Application:** Enhance resource allocation efficiency.

- **Anomaly Detection:**
    - **Objective:** Spot unusual spikes or drops in document registration or e-stamp revenue.
    - **Approach:** Implement algorithms like Isolation Forest or One-Class SVM.
    - **Application:** Early detection of anomalies for preventive measures.

#### Transportation:

- **Vehicle Sales Forecast:**
    - **Objective:** Project future vehicle sales per district.
    - **Approach:** Apply time series forecasting.
    - **Application:** Inform inventory management and marketing strategies.

- **Vehicle Preference Prediction:**
    - **Objective:** Determine the likely vehicle class preference in districts.
    - **Approach:** Use classification algorithms on historical trends.
    - **Application:** Tailor vehicle supply and marketing to district preferences.

- **Fuel Type Forecast:**
    - **Objective:** Estimate future vehicle sales by fuel type.
    - **Approach:** Time series forecasting models.
    - **Application:** Guide the development of fuel-specific infrastructure.

#### TS-iPASS:

- **Investment Forecasting:**
    - **Objective:** Anticipate sectors attracting the most investments.
    - **Approach:** Utilize time series models.
    - **Application:** Inform sector-specific development strategies.

- **District Investment Prediction:**
    - **Objective:** Predict districts likely to attract significant future investments.
    - **Approach:** Machine learning models on historical data.
    - **Application:** Direct investment promotion efforts.

- **Sectoral Seasonality:**
    - **Objective:** Identify seasonal investment patterns in specific sectors.
    - **Approach:** Time series decomposition.
    - **Application:** Optimize timing for investment drives.

#### General Predictive Analytics Ideas:

- **District Profiling:**
    - **Objective:** Segment districts for targeted policy-making.
    -
