In [1]:
import yaml
import mysql.connector
import os
import pandas as pd
import numpy as np
from ydata_profiling import ProfileReport

import matplotlib.pyplot as plt
plt.style.use('fivethirtyeight')
import seaborn as sns
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import plotly.express as px
import squarify
import geopandas as gpd
import matplotlib.patches as mpatches
import matplotlib.colors as mcolors
import folium

import scipy.stats as stats
from scipy.stats import ks_2samp
from scipy.stats import kstest, norm
from scipy.stats import levene
import statsmodels.api as sm
from statsmodels.tsa.stattools import grangercausalitytests
from scipy.stats import chi2_contingency

# Now you can use chi2_contingency function in your code


#from sklearn.preprocessing import LabelEncoder, StandardScaler
#from sklearn.cluster import KMeans
from sklearn.preprocessing import MinMaxScaler


import warnings
warnings.filterwarnings('ignore')
with warnings.catch_warnings():
    warnings.simplefilter("ignore")

In [2]:
# Load the database connection settings from the YAML config file.
# The path is a raw string so the Windows backslashes stay literal; in a
# normal string "\T" and "\d" are invalid escape sequences that only work
# by accident and trigger a deprecation/syntax warning on newer Pythons.
with open(r"D:\Telangana_Growth_Analysis\db_config.yaml", 'r') as file:
    db_config = yaml.safe_load(file)

# Connect to the database; the loaded mapping is passed as keyword arguments.
connection = mysql.connector.connect(**db_config)
cursor = connection.cursor()

In [4]:
# Join the 'investments' table to 'districts' on 'dist_code'.
query = """
    select * from investments
    inner join districts on investments.dist_code = districts.dist_code
    """

# Run the join through pandas, order the rows by 'id' and rebuild a clean
# 0..n-1 index.
investments = (
    pd.read_sql(query, connection)
    .sort_values(by='id')
    .reset_index(drop=True)
)

# Load the 'dim_date' table.
dim_date = pd.read_sql("SELECT * FROM dim_date", connection)

# Attach the date attributes to every investment row via the shared 'month' column.
investments = investments.merge(dim_date, how='inner', on='month')

# Show every column when displaying frames, then preview the merged result.
pd.set_option('display.max_columns', None)
investments.head()

Unnamed: 0,id,dist_code,month,sector,investment_in_cr,number_of_employees,dist_code.1,district,mmm,quarter,fiscal_year
0,1,14_1,2019-04-01,Engineering,2.32,15,14_1,Mahabubnagar\r,Apr,Q1,2019
1,2,19_1,2019-04-01,Engineering,0.63,13,19_1,Adilabad\r,Apr,Q1,2019
2,3,20_3,2019-04-01,Wood and Leather,0.2,8,20_3,Rajanna Sircilla\r,Apr,Q1,2019
3,4,20_3,2019-04-01,Textiles,0.27,27,20_3,Rajanna Sircilla\r,Apr,Q1,2019
4,5,21_5,2019-04-01,Electrical and Electronic Products,0.12,5,21_5,Mahabubabad\r,Apr,Q1,2019


In [5]:
# The district names come back with a trailing carriage return ("\r").
# Remove it as a literal character (regex=False) rather than as a pattern.
investments['district'] = investments['district'].str.replace('\r', '', regex=False)

# Drop the join key. The join returns 'dist_code' from both tables; naming the
# label once removes every column that carries it. errors='ignore' keeps the
# cell re-runnable after the columns are already gone.
investments = investments.drop(columns='dist_code', errors='ignore')

investments.head()

Unnamed: 0,id,month,sector,investment_in_cr,number_of_employees,district,mmm,quarter,fiscal_year
0,1,2019-04-01,Engineering,2.32,15,Mahabubnagar,Apr,Q1,2019
1,2,2019-04-01,Engineering,0.63,13,Adilabad,Apr,Q1,2019
2,3,2019-04-01,Wood and Leather,0.2,8,Rajanna Sircilla,Apr,Q1,2019
3,4,2019-04-01,Textiles,0.27,27,Rajanna Sircilla,Apr,Q1,2019
4,5,2019-04-01,Electrical and Electronic Products,0.12,5,Mahabubabad,Apr,Q1,2019


In [7]:
# Column dtypes, non-null counts and memory footprint of the cleaned frame.
investments.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 5753 entries, 0 to 5752
Data columns (total 9 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   id                   5753 non-null   int64  
 1   month                5753 non-null   object 
 2   sector               5753 non-null   object 
 3   investment_in_cr     5753 non-null   float64
 4   number_of_employees  5753 non-null   int64  
 5   district             5753 non-null   object 
 6   mmm                  5753 non-null   object 
 7   quarter              5753 non-null   object 
 8   fiscal_year          5753 non-null   int64  
dtypes: float64(1), int64(3), object(5)
memory usage: 449.5+ KB


In [8]:
# 'month' is loaded as a generic object column; convert it to a datetime
# dtype so it can be handled as a real date.

investments['month'] = pd.to_datetime(investments['month'])

In [9]:
# Summary statistics (count, mean, std, quartiles, min/max) for the numeric columns.
investments.describe()

Unnamed: 0,id,investment_in_cr,number_of_employees,fiscal_year
count,5753.0,5753.0,5753.0,5753.0
mean,2877.0,17.231975,117.344168,2020.519555
std,1660.892381,257.42715,1398.725287,1.109453
min,1.0,0.0,0.0,2019.0
25%,1439.0,0.22,7.0,2020.0
50%,2877.0,0.74,15.0,2021.0
75%,4315.0,3.12,40.0,2021.0
max,5753.0,17793.35,57000.0,2022.0


### Investment and Employee Analysis

#### Investment (in Cr):

- **Range:** The investments range from a minimum of 0 Cr to a substantial 17,793.35 Cr.
- **Central Tendency:** On average, entities have an investment of approximately 17.23 Cr.
- **Variability:** The standard deviation is 257.43, indicating a wide spread in the investment amounts across entities.
- **Distribution:** 
  - Half of the entities have investments less than or equal to 0.74 Cr.
  - 25% have investments of 0.22 Cr or less.
  - 75% have investments of 3.12 Cr or less.

#### Number of Employees:

- **Range:** Entities have employee counts ranging from 0 to a significant 57,000.
- **Central Tendency:** The average number of employees in these entities is around 117.34. However, this average is influenced by outliers, as the median (or the 50th percentile) is only 15.
- **Variability:** A high standard deviation of 1,398.73 suggests a significant variation in the number of employees across entities.
- **Distribution:**
  - 25% of entities have 7 employees or fewer.
  - 75% have 40 employees or fewer.
  
This suggests that a majority of entities in this dataset are relatively small in size.


In [10]:
# Location of the Telangana district map (GeoJSON).
map_file = r"D:\Telangana_Growth_Analysis\Telangana_Shape_Files\telangana_district_map.json"

# Read the map file with geopandas.
geo_data = gpd.read_file(map_file)

In [12]:
#%matplotlib inline
# Create the profile report
#profile = ProfileReport(investments, title="Profiling Report")

# Specify the path to save the report
#output_file_path = "D:\\Telangana_Growth_Analysis\\y_data\\profiling_report.html"

# Save the report to the specified path
#profile.to_file(output_file_path)

In [16]:
# List the column names available in the cleaned frame.
investments.columns

Index(['id', 'month', 'sector', 'investment_in_cr', 'number_of_employees',
       'district', 'mmm', 'quarter', 'fiscal_year'],
      dtype='object')

In [15]:
# Row(s) whose investment equals the largest investment value in the data.
investments[investments['investment_in_cr'] == investments['investment_in_cr'].max()]

Unnamed: 0,id,month,sector,investment_in_cr,number_of_employees,district,mmm,quarter,fiscal_year
1177,1178,2020-02-01,"Real Estate,Industrial Parks and IT Buildings",17793.35,25419,Rangareddy,Feb,Q4,2019


In [17]:
# Row(s) whose employee count equals the largest employee count in the data.
investments[investments['number_of_employees'] == investments['number_of_employees'].max()]

Unnamed: 0,id,month,sector,investment_in_cr,number_of_employees,district,mmm,quarter,fiscal_year
1065,1066,2020-01-01,"Real Estate,Industrial Parks and IT Buildings",394.0,57000,Rangareddy,Jan,Q4,2019


In [18]:
# The five records with the largest investment amounts.
investments.nlargest(5, 'investment_in_cr')

Unnamed: 0,id,month,sector,investment_in_cr,number_of_employees,district,mmm,quarter,fiscal_year
1177,1178,2020-02-01,"Real Estate,Industrial Parks and IT Buildings",17793.35,25419,Rangareddy,Feb,Q4,2019
939,940,2019-12-01,"Fertlizers Organic and Inorganic,Pesticides,In...",5254.28,450,Peddapalli,Dec,Q3,2019
4178,4179,2022-03-01,"Real Estate,Industrial Parks and IT Buildings",1862.73,144,Rangareddy,Mar,Q4,2021
5328,5329,2022-12-01,Plastic and Rubber,1556.07,307,Rangareddy,Dec,Q3,2022
5448,5449,2023-01-01,Plastic and Rubber,1553.0,1812,Sangareddy,Jan,Q4,2022


In [19]:
# The five records with the largest employee counts.
investments.nlargest(5, 'number_of_employees')

Unnamed: 0,id,month,sector,investment_in_cr,number_of_employees,district,mmm,quarter,fiscal_year
1065,1066,2020-01-01,"Real Estate,Industrial Parks and IT Buildings",394.0,57000,Rangareddy,Jan,Q4,2019
2628,2629,2021-02-01,"Real Estate,Industrial Parks and IT Buildings",0.0,40541,Rangareddy,Feb,Q4,2020
170,171,2019-05-01,"Real Estate,Industrial Parks and IT Buildings",847.82,40250,Rangareddy,May,Q1,2019
2697,2698,2021-03-01,"Real Estate,Industrial Parks and IT Buildings",0.0,31315,Rangareddy,Mar,Q4,2020
1681,1682,2020-07-01,"Real Estate,Industrial Parks and IT Buildings",149.0,27000,Rangareddy,Jul,Q2,2020


In [21]:
# Records where the investment amount or the employee count is recorded as zero.
investments[(investments['investment_in_cr'] == 0) | (investments['number_of_employees'] == 0)]

Unnamed: 0,id,month,sector,investment_in_cr,number_of_employees,district,mmm,quarter,fiscal_year
8,9,2019-04-01,Engineering,0.03,0,Nirmal,Apr,Q1,2019
44,45,2019-04-01,Granite and Stone Crushing,0.00,5,Medchal_Malkajgiri,Apr,Q1,2019
45,46,2019-04-01,Pharmaceuticals and Chemicals,0.00,75,Yadadri Bhuvanagiri,Apr,Q1,2019
72,73,2019-04-01,Wood and Leather,0.00,12,Sangareddy,Apr,Q1,2019
74,75,2019-04-01,Plastic and Rubber,33.60,0,Sangareddy,Apr,Q1,2019
...,...,...,...,...,...,...,...,...,...
3974,3975,2022-01-01,Wood and Leather,0.00,8,Nirmal,Jan,Q4,2021
4017,4018,2022-01-01,"Cement, Cement & Concrete Products, Fly Ash Br...",0.00,0,Suryapet,Jan,Q4,2021
5004,5005,2022-09-01,Granite and Stone Crushing,0.00,104,Mahabubnagar,Sep,Q2,2022
5110,5111,2022-09-01,Paper and Printing,0.00,200,Sangareddy,Sep,Q2,2022


In [24]:
# Measures to compare: (column, legend label, violin outline colour).
distribution_columns = [
    ('investment_in_cr', 'Investment in Cr', 'blue'),
    ('number_of_employees', 'Number of Employees', 'green'),
]

# Box plot: one box per measure on a shared axis.
fig_box = go.Figure(data=[
    go.Box(y=investments[column], name=label)
    for column, label, _ in distribution_columns
])
fig_box.update_layout(title_text="Box Plot of Investment and Number of Employees")
fig_box.show()

# Violin plot: same two measures, each with its inner box drawn.
fig_violin = go.Figure(data=[
    go.Violin(y=investments[column], box_visible=True, line_color=colour, name=label)
    for column, label, colour in distribution_columns
])
fig_violin.update_layout(title_text="Violin Plot of Investment and Number of Employees")
fig_violin.show()

Assuming the data is correct, there are extreme outliers.  

**Note on Outliers in the Data:**

The dataset contains some extreme outliers, particularly in the 'investment_in_cr' and 'number_of_employees' columns:

- The most extreme outlier in the 'investment_in_cr' column is a staggering 17,793.35 Cr.

- The next highest investment value is significantly lower, standing at 5,254 Cr.

- Interestingly, 75% of the data points have investments of 3.12 Cr or less, highlighting the substantial variation in investment amounts.

- On the employee front, the dataset includes an entity with an extraordinarily high employee count of 57,000.

- While the average number of employees across entities is approximately 117.34, it's essential to note that this average is significantly influenced by these outliers. The median value, which provides a more robust representation, stands at just 15.

- The majority of entities, precisely 75%, have a workforce of 40 employees or fewer, underscoring the prevalence of relatively smaller-sized entities in the dataset.

In [26]:
# Number of investment records per sector, most frequent first.
investments['sector'].value_counts()

Food Processing                                                                1065
Engineering                                                                     692
Cement, Cement & Concrete Products, Fly Ash Bricks                              659
Agro based incl Cold Storages                                                   579
Granite and Stone Crushing                                                      432
Pharmaceuticals and Chemicals                                                   353
Plastic and Rubber                                                              328
Others                                                                          301
Beverages                                                                       274
Wood and Leather                                                                233
Textiles                                                                        209
Paper and Printing                                                          

In [29]:
# Monthly totals of investment and employee counts, with 'month' kept as a
# regular column.
grouped_data = (
    investments
    .groupby('month', as_index=False)[['investment_in_cr', 'number_of_employees']]
    .sum()
)

# Helper that draws one monthly series in a given colour.
def plot_individual_column(column_name, title, color):
    """Show a line-and-marker chart of one column of ``grouped_data`` by month.

    Args:
        column_name: Column of the notebook-level ``grouped_data`` frame to
            plot on the y-axis.
        title: Title shown above the figure.
        color: Line colour handed to Plotly.
    """
    trace = go.Scatter(
        x=grouped_data['month'],
        y=grouped_data[column_name],
        mode='lines+markers',
        line={'color': color},
    )
    layout_options = {
        'title': title,
        'xaxis_title': 'Month/Year',
        'yaxis_title': 'Value',
        'template': "plotly_dark",
    }
    fig = go.Figure(data=[trace])
    fig.update_layout(**layout_options)
    fig.show()

# One chart per measure: investment in blue, employee counts in green.
for _column, _title, _color in (
    ('investment_in_cr', 'Monthly Trends for Investment in Cr', 'blue'),
    ('number_of_employees', 'Monthly Trends for Number of Employees', 'green'),
):
    plot_individual_column(_column, _title, _color)

# Problem Statement 8
## List down the top 5 sectors that have witnessed the most significant investments in FY 2022.

In [60]:
# Keep only the rows that belong to fiscal year 2022.
fy_2022 = investments.loc[investments['fiscal_year'].eq(2022)]

# Per-sector grouping of the FY 2022 rows.
sector_grouped = fy_2022.groupby('sector')

# Total investment (in Cr) for each sector.
sector_investments = sector_grouped['investment_in_cr'].agg('sum')

In [61]:
# Turn the per-sector totals into a frame with 'sector' as a regular column.
sector_investments_df = sector_investments.reset_index()

# Order the sectors by investment in ascending order (lowest first); this is
# the order in which the horizontal bars are handed to Plotly.
sorted_sectors_df = (
    sector_investments_df
    .sort_values(by='investment_in_cr')
    .reset_index(drop=True)
)

# Title used both on the figure and in the saved file name.
plot_title = "Top Sectors with the Most Significant Investments for FY 2022"

# Horizontal bar chart: one bar per sector.
investment_bar = go.Bar(
    name='Investment',
    x=sorted_sectors_df['investment_in_cr'],
    y=sorted_sectors_df['sector'],
    orientation='h',
)
fig = go.Figure(data=[investment_bar])
fig.update_layout(
    title=plot_title,
    xaxis_title='Investment in Cr',
    yaxis_title='Sector',
    template="plotly_dark",
    height=800,
    legend={'y': -0.1, 'x': 0.4, 'xanchor': 'center', 'orientation': 'h'},
)

# Save the figure as a standalone HTML file (adjust the path to your setup).
plot_path = f"D:\\Telangana_Growth_Analysis\\notebooks\\plots\\ts_ipass\\8\\{plot_title}.html"
fig.write_html(plot_path)

print(f"Plot saved to {plot_path}")

fig.show()

Plot saved to D:\Telangana_Growth_Analysis\notebooks\plots\ts_ipass\8\Top Sectors with the Most Significant Investments for FY 2022.html


In [62]:
# Rank the sectors by total FY 2022 investment, largest first.
sorted_sectors = sector_investments.sort_values(ascending=False)

# Keep the first five entries of that ranking.
top_5_sectors = sorted_sectors.iloc[:5]

top_5_sectors

sector
Plastic and Rubber                               5855.62
Pharmaceuticals and Chemicals                    2181.66
Real Estate,Industrial Parks and IT Buildings    2127.30
Solar and Other Renewable Energy                 2052.98
Engineering                                      1877.53
Name: investment_in_cr, dtype: float64

In [63]:

# One bar per top-5 sector. The Series is iterated directly as
# (sector, investment) pairs, so the cell depends only on `top_5_sectors`
# and runs on a fresh kernel without any separately defined colour list.
fig = go.Figure(data=[
    go.Bar(name=sector, x=[sector], y=[investment])
    for sector, investment in top_5_sectors.items()
])

# Update layout
fig.update_layout(
    title="Top 5 Sectors with the Most Significant Investments for FY 2022",
    xaxis_title='Sector',
    yaxis_title='Investment in Cr',
    template="plotly_dark",
    height=600,
    showlegend=False,
)

# File-name-friendly title used when saving the plot.
plot_title = "Top_5_Sectors_with_Most_Significant_Investments_FY_2022"

# Save the figure as a standalone HTML file in the plots directory.
plot_path = f"D:\\Telangana_Growth_Analysis\\notebooks\\plots\\ts_ipass\\8\\{plot_title}.html"
fig.write_html(plot_path)

print(f"Plot saved to {plot_path}")

fig.show()

Plot saved to D:\Telangana_Growth_Analysis\notebooks\plots\ts_ipass\8\Top_5_Sectors_with_Most_Significant_Investments_FY_2022.html


In [75]:
# Restrict the FY 2022 rows to the five top-investment sectors and total the
# employee counts per sector.
in_top_sectors = fy_2022['sector'].isin(top_5_sectors.index)
employment_in_top_sectors = (
    fy_2022.loc[in_top_sectors]
    .groupby('sector')['number_of_employees']
    .sum()
)

# Vertical bar chart: one bar per sector.
employment_bar = go.Bar(
    name='Number of Employees',
    x=employment_in_top_sectors.index,
    y=employment_in_top_sectors.values,
)
fig = go.Figure(data=[employment_bar])
fig.update_layout(
    title="Number of Employees in the Top 5 Sectors for FY 2022",
    xaxis_title='Sector',
    yaxis_title='Number of Employees',
    template="plotly_dark",
    height=600,
    showlegend=False,
)

# File-name-friendly title used when saving the plot.
plot_title = "Number_of_Employees_in_Top_5_Sectors_FY_2022"

# Save the figure as a standalone HTML file in the plots directory.
plot_path = f"D:\\Telangana_Growth_Analysis\\notebooks\\plots\\ts_ipass\\8\\{plot_title}.html"
fig.write_html(plot_path)

print(f"Plot saved to {plot_path}")

fig.show()

Plot saved to D:\Telangana_Growth_Analysis\notebooks\plots\ts_ipass\8\Number_of_Employees_in_Top_5_Sectors_FY_2022.html


## Top 5 Investment Sectors in FY 2022

The top 5 sectors that witnessed the most significant investments in FY 2022 are:

1. Plastic and Rubber: 5855.62 crores
2. Pharmaceuticals and Chemicals: 2181.66 crores
3. Real Estate, Industrial Parks, and IT Buildings: 2127.30 crores
4. Solar and Other Renewable Energy: 2052.98 crores
5. Engineering: 1877.53 crores

### Reasons for High Investments

- **Plastic and Rubber:** The growth in this sector might be influenced by increased demand for packaging, especially in e-commerce, and the rapid expansion of the automobile industry which uses a lot of plastic and rubber components.

- **Pharmaceuticals and Chemicals:** The pharmaceutical industry has witnessed significant growth due to the global health crisis. The demand for medicines, research, and vaccine development might have attracted more investments.

- **Real Estate, Industrial Parks, and IT Buildings:** Urbanization, the growth of IT hubs, and favorable government policies to attract tech companies could be reasons for growth in this sector.

- **Solar and Other Renewable Energy:** With the global emphasis on sustainable energy solutions and reducing carbon emissions, there's a push towards renewable energy sources, especially solar.

- **Engineering:** The engineering sector's growth can be attributed to infrastructural development, urbanization, and technological advancements.

### Employment Numbers

The number of employees in the top 5 sectors for FY 2022 are:

1. Engineering: 12,025 employees
2. Pharmaceuticals and Chemicals: 13,591 employees
3. Plastic and Rubber: 9,078 employees
4. Real Estate, Industrial Parks, and IT Buildings: 6,424 employees
5. Solar and Other Renewable Energy: 1,866 employees


## Summary

- The top 5 sectors with the most significant investments in FY 2022 are:
1. Plastic and Rubber
2. Pharmaceuticals and Chemicals
3. Real Estate, Industrial Parks, and IT Buildings
4. Solar and Other Renewable Energy
5. Engineering
- Possible reasons for high investments in these sectors include the growth of the e-commerce and automobile industries, global health crises, urbanization, technological advancements, and a global push towards sustainable energy.


## Recommendations

1. Deep Dive into Sector-Specific Dynamics: While we provided some general reasons for investments, a deeper dive into each sector's specific market dynamics would provide a clearer picture.
2. Focus on Renewable Energy: With a growing emphasis on sustainable energy, the state could further promote and facilitate growth in the renewable energy sector.
3. Skill Development Programs: The sectors attracting the most investment are not necessarily the largest employers (Plastic and Rubber leads on investment but ranks only third of the five on employment), so it might be beneficial to launch skill development programs tailored to the needs of these top sectors. This would help ensure that investments also lead to job creation.
4. Infrastructure Development: Focusing on developing infrastructure can support the growth of sectors like Real Estate and Engineering further.
5. Promote Research and Development: Especially in sectors like Pharmaceuticals and Chemicals, promoting R&D can lead to more innovations and attract further investments.
