# Approach
- Basic Data Analysis
- Exploratory Data Analysis
    - Univariate, Bivariate, Multivariate Analysis
    - Target vs Features
- Data Preprocessing
- Statistical Tests
    - Chisquare, Anova Test
- Modeling
- Assumptions of Linear Regression

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

import scipy.stats as stats

# from sklearn.decomposition import PCA
# from sklearn.preprocessing import StandardScaler, OneHotEncoder, RobustScaler
# from sklearn.model_selection import train_test_split, RandomizedSearchCV
# from sklearn.linear_model import LogisticRegression
# from sklearn.tree import DecisionTreeClassifier
# from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
# from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay, classification_report, roc_curve, precision_recall_curve, auc

# from xgboost import XGBClassifier
bold_start = '\033[1m'
bold_end = '\033[0m'

In [2]:
raw_data = pd.read_csv(r"F:\Muthu_2023\Personal\NextStep\DSCourse\Scaler\Business-Case-Study\Scaler\Dataset\scaler_clustering.csv")
raw_data.head()

Unnamed: 0.1,Unnamed: 0,company_hash,email_hash,orgyear,ctc,job_position,ctc_updated_year
0,0,atrgxnnt xzaxv,6de0a4417d18ab14334c3f43397fc13b30c35149d70c05...,2016.0,1100000,Other,2020.0
1,1,qtrxvzwt xzegwgbb rxbxnta,b0aaf1ac138b53cb6e039ba2c3d6604a250d02d5145c10...,2018.0,449999,FullStack Engineer,2019.0
2,2,ojzwnvwnxw vx,4860c670bcd48fb96c02a4b0ae3608ae6fdd98176112e9...,2015.0,2000000,Backend Engineer,2020.0
3,3,ngpgutaxv,effdede7a2e7c2af664c8a31d9346385016128d66bbc58...,2017.0,700000,Backend Engineer,2019.0
4,4,qxen sqghu,6ff54e709262f55cb999a1c1db8436cb2055d8f79ab520...,2017.0,1400000,FullStack Engineer,2019.0


# Data Analysis

In [4]:
raw_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 205843 entries, 0 to 205842
Data columns (total 7 columns):
 #   Column            Non-Null Count   Dtype  
---  ------            --------------   -----  
 0   Unnamed: 0        205843 non-null  int64  
 1   company_hash      205799 non-null  object 
 2   email_hash        205843 non-null  object 
 3   orgyear           205757 non-null  float64
 4   ctc               205843 non-null  int64  
 5   job_position      153281 non-null  object 
 6   ctc_updated_year  205843 non-null  float64
dtypes: float64(2), int64(2), object(3)
memory usage: 11.0+ MB


In [5]:
raw_data.describe()

Unnamed: 0.1,Unnamed: 0,orgyear,ctc,ctc_updated_year
count,205843.0,205757.0,205843.0,205843.0
mean,103273.941786,2014.88275,2271685.0,2019.628231
std,59741.306484,63.571115,11800910.0,1.325104
min,0.0,0.0,2.0,2015.0
25%,51518.5,2013.0,530000.0,2019.0
50%,103151.0,2016.0,950000.0,2020.0
75%,154992.5,2018.0,1700000.0,2021.0
max,206922.0,20165.0,1000150000.0,2021.0


In [6]:
raw_data.describe(include='object')

Unnamed: 0,company_hash,email_hash,job_position
count,205799,205843,153281
unique,37299,153443,1017
top,nvnv wgzohrnvzwj otqcxwto,bbace3cc586400bbc65765bc6a16b77d8913836cfc98b7...,Backend Engineer
freq,8337,10,43554


## Unique Values

In [7]:
raw_data.nunique()

Unnamed: 0          205843
company_hash         37299
email_hash          153443
orgyear                 77
ctc                   3360
job_position          1017
ctc_updated_year         7
dtype: int64

## Null Values

In [12]:
print(bold_start + "Percentage of Null values by columns" + bold_end)
(raw_data.isnull().sum()/len(raw_data))*100

[1mPercentage of Null values by columns[0m


Unnamed: 0           0.000000
company_hash         0.021376
email_hash           0.000000
orgyear              0.041779
ctc                  0.000000
job_position        25.534995
ctc_updated_year     0.000000
dtype: float64

#### Inference:
- Dimensionally small dataset
- Personal details are given as hash values
- Some errors in org year column, as the min value is 0 and no. of unique values is 77

## Transform Hash

In [13]:
df = raw_data.copy()

In [17]:
company_hash_dict = dict(zip(df['company_hash'].unique(), np.arange(df['company_hash'].nunique())))
df['company_hash'] = df['company_hash'].map(company_hash_dict)

In [22]:
email_hash_dict = dict(zip(df['email_hash'].unique(), np.arange(df['email_hash'].nunique())))
df['email_hash'] = df['email_hash'].map(email_hash_dict)

## Remove Unnamed column

In [24]:
df.drop('Unnamed: 0', axis=1, inplace=True)

In [None]:
df.head()

# Exploratory Data Analysis
- Univariate Analysis
    - Categorical: *Frequency and Proportion Plots*
    - Numerical: *Distribution and Box plots*
    - *Skewness and Kurtosis*
- Bivariate Analysis
- Multivariate Analysis

In [25]:
cat_features = list(df.columns).remove('ctc')
num_features = ['ctc']

In [None]:
def univariate_categoryplot(df, feature):
    plt.figure(figsize=(12,4))   
    if df[feature].nunique() < 10:
        # Categorical Feature 
        plt.suptitle('Univariate Analysis of ' + feature + ": Frequency and Proportion")
        plt.subplot(1,2,1)
        sns.countplot(data=df, x=feature) 
        plt.subplot(1,2,2)
        labels = df[feature].value_counts().index
        plt.pie(x=list(df[feature].value_counts()), labels=list(labels), autopct="%0.2f")        
        plt.show()
    else:
        # Numerical Feature
        temp = df[feature].value_counts()[:10]
        plt.suptitle('Univariate Analysis of ' + feature + ": Frequency and Proportion")
        plt.subplot(1,2,1)
        sns.countplot(data=df, x=feature) 
        plt.subplot(1,2,2)
        labels = df[feature].value_counts().index
        plt.pie(x=list(df[feature].value_counts()), labels=list(labels), autopct="%0.2f")        
        plt.show()

## Orgyear

In [42]:
np.set_printoptions(suppress=True)
print(df['orgyear'].unique())

[ 2016.  2018.  2015.  2017.  2019.  2020.  2012.  2013.  2003.  2006.
  2014.  2011.  2021.  2008.  2004.  2022.  2009.  2005.  2010.  2007.
  2000.  2002.  2023.  2001.  1981.  2031.    nan  2024.  1996.  1999.
  2106.  1997.  1994.  1995.  1992.  1973.  1991.  1998.  1990.  1993.
  1988.  2025.  2029.     0.   208.  1985.   209.   206.  1982.  2026.
  1970.  2101.  1972.  2107.  1986.  1989.    91.  1987.     3.  2027.
     2.  1976.     4.     5.  1971.  1977.  1984.    83.     1.  1979.
  2028.  2204.    38.  1900.   201.     6. 20165.   200.]


In [40]:
df['orgyear'] = pd.to_datetime(df['orgyear'].apply(lambda x: x if x < 2025 and x>1900 else np.nan), format='%Y').dt.year

173

#### Inference:
- Many typo errors in the year column such as 0, 1, 2, 209, 91 etc.,
- Transformed such typo errors into nan values
- Later all nan values will be imputed

## CTC Updated Year

In [43]:
np.set_printoptions(suppress=True)
print(df['ctc_updated_year'].unique())

[2020. 2019. 2021. 2017. 2016. 2015. 2018.]


In [40]:
df['orgyear'] = pd.to_datetime(df['orgyear'].apply(lambda x: x if x < 2025 and x>1900 else np.nan), format='%Y').dt.year

173

#### Inference:
- Many typo errors in the year column such as 0, 1, 2, 209, 91 etc.,
- Transformed such typo errors into nan values
- Later all nan values will be imputed

In [45]:
sorted(df['job_position'].unique())

TypeError: '<' not supported between instances of 'float' and 'str'

In [49]:
sorted(df['job_position'].dropna().unique())

[' SDE 2',
 '.',
 '..',
 '.7',
 '7',
 '7033771951',
 '737',
 '857628',
 '896651',
 '91',
 'A Group Chat Application',
 'ABAP Developer',
 'ASE',
 'ASSISTANT ENGINEER TRAINEE',
 'ASSISTANT PROFESSOR ',
 'ASSISTANT SYSTEM ENGINEER',
 'ATM Interface System',
 'Account',
 'Administrative clerk',
 'Administrator',
 'Advisor',
 'Advisory Consultant - UI/UX Expert',
 'Advisory Software Engineer',
 'Advisory System Analyat',
 'Agency collection manager',
 'Ai engineer',
 'Analyst / Software Engineer',
 'Analyst consultant',
 'Analyst programmer',
 'Analytics and Insite Devloper',
 'Android Application Developer',
 'Android Application developer',
 'Android Developer Sr Executive',
 'Android Engineer',
 'Android Facilitator',
 'Android Lead',
 'Any technical',
 'App Developer',
 'Application Developer (Frontend)',
 'Application Developer 1',
 'Application Developer 2',
 'Application Developer Analyst',
 'Application Development Analyst',
 'Application Development Associate',
 'Application Devel

In [None]:
#### Inference:
- Invalid occupation
- Case sensitive differences
- Trim the text and remove double spaces
- create categories for occupation