In [1]:
import yaml
import mysql.connector
import os
import pandas as pd
import numpy as np
from ydata_profiling import ProfileReport

import matplotlib.pyplot as plt
plt.style.use('fivethirtyeight')
import seaborn as sns
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import plotly.express as px
import squarify
import geopandas as gpd
import matplotlib.patches as mpatches
import matplotlib.colors as mcolors
import folium

import scipy.stats as stats
from scipy.stats import ks_2samp
from scipy.stats import kstest, norm
from scipy.stats import levene
import statsmodels.api as sm
from statsmodels.tsa.stattools import grangercausalitytests
from scipy.stats import chi2_contingency

# Now you can use chi2_contingency function in your code


#from sklearn.preprocessing import LabelEncoder, StandardScaler
#from sklearn.cluster import KMeans
from sklearn.preprocessing import MinMaxScaler


import warnings
warnings.filterwarnings('ignore')
with warnings.catch_warnings():
    warnings.simplefilter("ignore")

In [2]:
# Load the database connection settings from the YAML config file.
# The path is written as a raw string: in a normal string literal "\T" and "\d"
# are invalid escape sequences (flagged by newer Python versions), and a path
# component starting with e.g. "t" or "n" would silently become a tab/newline.
with open(r"D:\Telangana_Growth_Analysis\db_config.yaml", 'r') as file:
    db_config = yaml.safe_load(file)

# Connect to the database. Every key in db_config is passed through unchanged
# as a keyword argument to mysql.connector.connect.
connection = mysql.connector.connect(**db_config)
cursor = connection.cursor()

In [4]:
# Join each row of 'investments' to its matching 'districts' row on 'dist_code'.
query = """
    select * from investments
    inner join districts on investments.dist_code = districts.dist_code
    """

# Run the join through pandas, order the rows by 'id' (ascending) and
# renumber the index from 0.
investments = (
    pd.read_sql(query, connection)
    .sort_values(by='id', ascending=True)
    .reset_index(drop=True)
)

# Read the whole 'dim_date' table, then inner-join it onto the investments
# frame using the shared 'month' column.
dim_date = pd.read_sql("SELECT * FROM dim_date", connection)
investments = pd.merge(investments, dim_date, on='month', how='inner')

# Remove the column-display limit so the preview below shows every column.
pd.set_option('display.max_columns', None)
investments.head()

Unnamed: 0,id,dist_code,month,sector,investment_in_cr,number_of_employees,dist_code.1,district,mmm,quarter,fiscal_year
0,1,14_1,2019-04-01,Engineering,2.32,15,14_1,Mahabubnagar\r,Apr,Q1,2019
1,2,19_1,2019-04-01,Engineering,0.63,13,19_1,Adilabad\r,Apr,Q1,2019
2,3,20_3,2019-04-01,Wood and Leather,0.2,8,20_3,Rajanna Sircilla\r,Apr,Q1,2019
3,4,20_3,2019-04-01,Textiles,0.27,27,20_3,Rajanna Sircilla\r,Apr,Q1,2019
4,5,21_5,2019-04-01,Electrical and Electronic Products,0.12,5,21_5,Mahabubabad\r,Apr,Q1,2019


In [5]:
# The district names contain a carriage-return character ('\r'); strip it out.
# regex=False makes the literal-match intent explicit instead of relying on the
# pandas-version-dependent default of the `regex` argument.
investments['district'] = investments['district'].str.replace('\r', '', regex=False)

# Drop the 'dist_code' key now that the district name is attached. The label
# only needs to be listed once: drop removes every column carrying it.
investments = investments.drop(columns=['dist_code'])

investments.head()

Unnamed: 0,id,month,sector,investment_in_cr,number_of_employees,district,mmm,quarter,fiscal_year
0,1,2019-04-01,Engineering,2.32,15,Mahabubnagar,Apr,Q1,2019
1,2,2019-04-01,Engineering,0.63,13,Adilabad,Apr,Q1,2019
2,3,2019-04-01,Wood and Leather,0.2,8,Rajanna Sircilla,Apr,Q1,2019
3,4,2019-04-01,Textiles,0.27,27,Rajanna Sircilla,Apr,Q1,2019
4,5,2019-04-01,Electrical and Electronic Products,0.12,5,Mahabubabad,Apr,Q1,2019


In [7]:
# Print the column names, non-null counts, dtypes and memory usage of the frame.
investments.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 5753 entries, 0 to 5752
Data columns (total 9 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   id                   5753 non-null   int64  
 1   month                5753 non-null   object 
 2   sector               5753 non-null   object 
 3   investment_in_cr     5753 non-null   float64
 4   number_of_employees  5753 non-null   int64  
 5   district             5753 non-null   object 
 6   mmm                  5753 non-null   object 
 7   quarter              5753 non-null   object 
 8   fiscal_year          5753 non-null   int64  
dtypes: float64(1), int64(3), object(5)
memory usage: 449.5+ KB


In [8]:
# Convert the 'month' column to pandas datetime values
# (info() above reports it as dtype 'object').

investments['month'] = pd.to_datetime(investments['month'])

In [9]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns.
investments.describe()

Unnamed: 0,id,investment_in_cr,number_of_employees,fiscal_year
count,5753.0,5753.0,5753.0,5753.0
mean,2877.0,17.231975,117.344168,2020.519555
std,1660.892381,257.42715,1398.725287,1.109453
min,1.0,0.0,0.0,2019.0
25%,1439.0,0.22,7.0,2020.0
50%,2877.0,0.74,15.0,2021.0
75%,4315.0,3.12,40.0,2021.0
max,5753.0,17793.35,57000.0,2022.0


### Investment and Employee Analysis

#### Investment (in Cr):

- **Range:** The investments range from a minimum of 0 Cr to a substantial 17,793.35 Cr.
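- **Central Tendency:** On average, entities have an investment of approximately 17.23 Cr. However, this mean is pulled up by a few very large investments: the median (50th percentile) is only 0.74 Cr.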
- **Variability:** The standard deviation is 257.43, indicating a wide spread in the investment amounts across entities.
- **Distribution:** 
  - Half of the entities have investments less than or equal to 0.74 Cr.
  - 25% have investments of 0.22 Cr or less.
  - 75% have investments of 3.12 Cr or less.

#### Number of Employees:

- **Range:** Entities have employee counts ranging from 0 to a significant 57,000.
- **Central Tendency:** The average number of employees in these entities is around 117.34. However, this average is influenced by outliers, as the median (or the 50th percentile) is only 15.
- **Variability:** A high standard deviation of 1,398.73 suggests a significant variation in the number of employees across entities.
- **Distribution:**
  - 25% of entities have 7 employees or fewer.
  - 75% have 40 employees or fewer.
  
Taken together, the quartiles suggest that most entities in this dataset are relatively small, while a handful of very large entries drive the means and maxima.


In [10]:
# Read the Telangana district map file into geopandas.
map_file = r"D:\Telangana_Growth_Analysis\Telangana_Shape_Files\telangana_district_map.json"
geo_data = gpd.read_file(map_file)

In [12]:
#%matplotlib inline
# Create the profile report
#profile = ProfileReport(investments, title="Profiling Report")

# Specify the path to save the report
#output_file_path = "D:\\Telangana_Growth_Analysis\\y_data\\profiling_report.html"

# Save the report to the specified path
#profile.to_file(output_file_path)

In [16]:
# List the column labels remaining after the clean-up steps.
investments.columns

Index(['id', 'month', 'sector', 'investment_in_cr', 'number_of_employees',
       'district', 'mmm', 'quarter', 'fiscal_year'],
      dtype='object')

In [15]:
# Show the row(s) whose investment equals the largest value in the column.
is_max_investment = investments['investment_in_cr'] == investments['investment_in_cr'].max()
investments[is_max_investment]

Unnamed: 0,id,month,sector,investment_in_cr,number_of_employees,district,mmm,quarter,fiscal_year
1177,1178,2020-02-01,"Real Estate,Industrial Parks and IT Buildings",17793.35,25419,Rangareddy,Feb,Q4,2019


In [17]:
# Show the row(s) whose employee count equals the largest value in the column.
is_max_employees = investments['number_of_employees'] == investments['number_of_employees'].max()
investments[is_max_employees]

Unnamed: 0,id,month,sector,investment_in_cr,number_of_employees,district,mmm,quarter,fiscal_year
1065,1066,2020-01-01,"Real Estate,Industrial Parks and IT Buildings",394.0,57000,Rangareddy,Jan,Q4,2019


In [18]:
# Five rows with the highest 'investment_in_cr', largest first.
investments.nlargest(n=5, columns='investment_in_cr')

Unnamed: 0,id,month,sector,investment_in_cr,number_of_employees,district,mmm,quarter,fiscal_year
1177,1178,2020-02-01,"Real Estate,Industrial Parks and IT Buildings",17793.35,25419,Rangareddy,Feb,Q4,2019
939,940,2019-12-01,"Fertlizers Organic and Inorganic,Pesticides,In...",5254.28,450,Peddapalli,Dec,Q3,2019
4178,4179,2022-03-01,"Real Estate,Industrial Parks and IT Buildings",1862.73,144,Rangareddy,Mar,Q4,2021
5328,5329,2022-12-01,Plastic and Rubber,1556.07,307,Rangareddy,Dec,Q3,2022
5448,5449,2023-01-01,Plastic and Rubber,1553.0,1812,Sangareddy,Jan,Q4,2022


In [19]:
# Five rows with the highest 'number_of_employees', largest first.
investments.nlargest(n=5, columns='number_of_employees')

Unnamed: 0,id,month,sector,investment_in_cr,number_of_employees,district,mmm,quarter,fiscal_year
1065,1066,2020-01-01,"Real Estate,Industrial Parks and IT Buildings",394.0,57000,Rangareddy,Jan,Q4,2019
2628,2629,2021-02-01,"Real Estate,Industrial Parks and IT Buildings",0.0,40541,Rangareddy,Feb,Q4,2020
170,171,2019-05-01,"Real Estate,Industrial Parks and IT Buildings",847.82,40250,Rangareddy,May,Q1,2019
2697,2698,2021-03-01,"Real Estate,Industrial Parks and IT Buildings",0.0,31315,Rangareddy,Mar,Q4,2020
1681,1682,2020-07-01,"Real Estate,Industrial Parks and IT Buildings",149.0,27000,Rangareddy,Jul,Q2,2020
