***UTILITY FUNCTIONS AND LIBRARIES***

In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from pandas_profiling import ProfileReport
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import RandomForestRegressor
from sklearn.experimental import enable_iterative_imputer  # noqa: F401 -- must run before importing IterativeImputer
from sklearn.feature_selection import RFE
from sklearn.impute import IterativeImputer
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import OrdinalEncoder

***DATASET LOADING***

In [None]:
# Load the raw layoffs CSV from a path relative to the notebook.
# NOTE(review): the preprocessing cell further down re-reads the data from
# "/kaggle/input/layoffs-data-2022/layoffs_data.csv" instead -- confirm which
# location is intended so the notebook runs top to bottom in one environment.
df = pd.read_csv("../dataset/layoffs_data.csv")

***DATASET PRELIMINARY ANALYSIS***

In [None]:
# Display the raw frame to eyeball its columns and values.
df

In [None]:
# (rows, columns) of the raw frame.
df.shape

In [None]:
# Per-column dtypes and non-null counts; used below to spot columns with missing values.
df.info()

Therefore we have null/unknown values in 'Laid_Off_Count', 'Percentage', 'Funds_Raised', and 'List_of_Employees_Laid_Off'.

'List_of_Employees_Laid_Off' can be removed, as it isn't significant: most of its values are 'Unknown' and the rest are links to spreadsheets. Race, position and other employee characteristics should ideally have been taken into account, but that wasn't possible here.

'Source' can be dropped as well, since its values are links.

In [None]:
# Frequency of each distinct value in the column, to check how often it is 'Unknown'.
df['List_of_Employees_Laid_Off'].value_counts()

In [None]:
# Build a pandas-profiling report for the raw frame and render it as the cell output.
# NOTE(review): the pandas_profiling package has been renamed upstream
# (ydata-profiling) -- consider migrating the import when dependencies are updated.
df_analysis = ProfileReport(df)
df_analysis

In [None]:
def drop_cols(df,cols):
    """Return ``df`` without the columns named in ``cols``.

    The input frame is left untouched: a new frame is returned instead of
    deleting columns in place, so a cell that displayed or still holds the
    original frame does not see it change underneath it.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to remove columns from.
    cols : str or iterable of str
        Column label(s) to remove. A single string is treated as one column
        name rather than being iterated character by character.

    Returns
    -------
    pandas.DataFrame
        A new frame containing every column of ``df`` except ``cols``.

    Raises
    ------
    KeyError
        If any requested column is not present in ``df``.
    """
    if isinstance(cols, str):
        cols = [cols]
    return df.drop(columns=list(cols))

In [None]:
# Remove the two columns judged uninformative above, plus 'Date_Added'.
df = drop_cols(df, ['Source','List_of_Employees_Laid_Off', 'Date_Added'])

In [None]:
# Per column: does it contain at least one NaN?
df.isna().any()

In [None]:
# Per column: does it contain the literal placeholder string 'Unknown'?
df.isin(['Unknown']).any()

'Laid_Off_Count', 'Percentage', 'Funds_Raised' have missing values

'Industry' and 'Stage' have 'Unknown' values

9 features

Categorical : 'Company' , 'Location' , 'Industry', 'Date', 'Stage', 'Country'

Numeric : 'Laid_Off_Count', 'Percentage', 'Funds_Raised' 

In [None]:
# Column groups consumed by the preprocessing helpers and the scaling step below.
# Categorical columns that are ordinal-encoded by cat2num.
cat_ex = ['Company', 'Location', 'Country']
# Columns cast to integer by round2 once imputation is done.
round_ex = ['Laid_Off_Count', 'Funds_Raised', 'Industry', 'Stage']
# Categorical columns whose missing ('Unknown') entries are imputed by cat2fill.
cat_fill_ex = ['Industry', 'Stage']
# Numeric columns whose missing entries are imputed by num2fill.
num_fill_ex = [ 'Percentage','Laid_Off_Count','Funds_Raised']
# Numeric columns that are min-max scaled before modelling.
num_cols = ['Laid_Off_Count', 'Percentage', 'Funds_Raised']

In [None]:
def cat2num(df,cols):
    """Ordinal-encode each listed column of ``df`` in place.

    Every column in ``cols`` is encoded on its own (the encoder is refit per
    column) and the resulting codes are stored back as integers.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the categorical columns; it is modified in place.
    cols : iterable of str
        Names of the columns to encode.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with the listed columns replaced by integer codes.
    """
    encoder = OrdinalEncoder()
    for column in cols:
        # fit_transform needs a 2-D input, hence the one-column frame df[[column]].
        codes = encoder.fit_transform(df[[column]])
        df[column] = codes
        df[column] = df[column].astype('int')
    return df

In [None]:
# def impute(df,cols):
#     my_imputer = SimpleImputer()
#     for i in cols:
#         df[i] = my_imputer.fit_transform(df[[i]])
#     return df

In [None]:
def round2(df,cols):
    """Round the listed columns to the nearest integer and store them as ints.

    A plain ``astype(int)`` truncates toward zero (3.9 -> 3), which biases
    imputed values downward; rounding first keeps the closest whole number.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the numeric columns; it is modified in place.
    cols : iterable of str
        Names of the columns to round.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with the listed columns converted to integers.

    Raises
    ------
    ValueError
        If a listed column still contains NaN (it cannot be cast to int).
    """
    for col in cols:
        df[col] = df[col].round().astype(int)
    return df

In [None]:
def date2convert(df,col):
    """Replace a date column by its month and day-of-month components.

    The column ``col`` is parsed with ``pd.to_datetime``; two new columns,
    'Date_month' and 'Date_day', are appended and ``col`` itself is removed.
    The year is not kept.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the date column; it is modified in place.
    col : str
        Name of the column to parse and replace.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with the date column replaced.
    """
    timestamps = pd.to_datetime(df[col])
    for part in ('month', 'day'):
        df['Date_' + part] = getattr(timestamps.dt, part)
    df.drop(columns=[col], inplace=True)
    return df

In [None]:
def cat2fill(df,cols):
    """Label-encode the listed categorical columns and impute their missing entries.

    Each column is encoded to numeric codes using only its observed values;
    rows that were missing stay NaN at their original positions. The NaNs are
    then filled by an ``IterativeImputer`` driven by a random-forest
    classifier, starting from the most frequent code.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the categorical columns; it is modified in place.
        Missing entries must already be NaN.
    cols : list of str
        Names of the categorical columns to encode and impute.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with the listed columns replaced by numeric
        codes and no missing values left in them.
    """
    def _encode_observed(series):
        # Encode the *values* of the non-null rows and write the codes back at
        # those same row positions, so missing rows remain NaN for the imputer.
        encoded = pd.Series(float('nan'), index=series.index, dtype=float)
        observed = series.notna()
        if observed.any():
            encoded.loc[observed] = LabelEncoder().fit_transform(series[observed])
        return encoded

    df[cols] = df[cols].apply(_encode_observed)
    # Seeds are fixed so that re-running the notebook yields the same imputations.
    imp_cat = IterativeImputer(estimator=RandomForestClassifier(random_state=0),
                               initial_strategy='most_frequent',
                               random_state=0)
    df[cols] = imp_cat.fit_transform(df[cols])
    return df

In [None]:
def num2fill(df,cols):
    """Impute missing values in the listed numeric columns.

    Uses an ``IterativeImputer`` with a random-forest regressor, initialised
    with the column mean and limited to 10 iterations.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the numeric columns; it is modified in place.
    cols : list of str
        Names of the numeric columns to impute.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with the listed columns overwritten by the
        imputer's output.
    """
    forest_imputer = IterativeImputer(
        estimator=RandomForestRegressor(),
        initial_strategy='mean',
        max_iter=10,
        random_state=0,
    )
    imputed = forest_imputer.fit_transform(df[cols])
    df[cols] = imputed
    return df

In [None]:
# Full preprocessing pass, starting again from the raw CSV.
# NOTE(review): this reads from a Kaggle path while the first load cell uses
# "../dataset/layoffs_data.csv" -- confirm which location is intended.
df = (
    pd.read_csv("/kaggle/input/layoffs-data-2022/layoffs_data.csv")
    .pipe(drop_cols, ['Source','List_of_Employees_Laid_Off', 'Date_Added'])
    # Turn the 'Unknown' placeholder into NaN so the imputers treat it as missing.
    .replace('Unknown', np.nan)
    .pipe(num2fill, num_fill_ex)
    .pipe(cat2num, cat_ex)
    .pipe(date2convert, 'Date')
    .pipe(cat2fill, cat_fill_ex)
    .pipe(round2, round_ex)
)
df

In [None]:
# Count remaining NaNs per column after preprocessing.
df.isnull().sum()

In [None]:
# Dtypes and non-null counts of the processed frame.
df.info()

***APPLYING ML MODELS & HYPERPARAMETER TUNING***

In [None]:
# Target and feature matrix.
# NOTE(review): 'Industry' is a label-encoded category; treating it as a
# regression target is questionable -- confirm this is the intended setup.
y = df['Industry'].values
features = df.drop('Industry', axis=1)
X = features.values  # full, unscaled feature matrix

# Split BEFORE scaling so the scaler never sees the held-out rows; the seed
# makes the split (and every score below) reproducible across reruns.
train_part, test_part, y_train, y_test = train_test_split(
    features, y, test_size=0.3, random_state=100)
train_part = train_part.copy()
test_part = test_part.copy()

# Fit the min-max ranges on the training rows only, then apply them to both parts.
scaler = MinMaxScaler()
train_part[num_cols] = scaler.fit_transform(train_part[num_cols])
test_part[num_cols] = scaler.transform(test_part[num_cols])

X_train, X_test = train_part.values, test_part.values

In [None]:
# Baseline linear model fitted on all features.
lm = LinearRegression()
lm.fit(X_train, y_train)

# Recursive feature elimination down to 8 features.
rfe = RFE(lm, n_features_to_select=8)
rfe = rfe.fit(X_train, y_train)

# Pair each feature with its selection flag and rank. The target column is
# excluded so the names line up with the columns of X_train; zipping against
# df.columns would shift every name after 'Industry' by one position.
feature_names = df.columns.drop('Industry')
list(zip(feature_names, rfe.support_, rfe.ranking_))


In [None]:
# R^2 of the RFE-selected linear model on the held-out split.
y_pred = rfe.predict(X_test)
r2 = r2_score(y_true=y_test, y_pred=y_pred)
print(r2)

In [None]:
# R^2 of the plain linear model under 5-fold shuffled cross-validation on the training split.
folds = KFold(n_splits=5, shuffle=True, random_state=100)
scores = cross_val_score(estimator=lm, X=X_train, y=y_train, scoring='r2', cv=folds)
scores

In [None]:
# Negated mean squared error of the same model; cv=5 here means five
# unshuffled folds, not the shuffled `folds` splitter used for R^2.
scores = cross_val_score(estimator=lm, X=X_train, y=y_train,
                         scoring='neg_mean_squared_error', cv=5)
scores

In [None]:
# Grid-search the number of features kept by RFE, scored by cross-validated R^2.
folds = KFold(n_splits = 5, shuffle = True, random_state = 100)

# Try every feature count from 1 up to the number of columns actually present,
# instead of hard-coding the upper bound.
n_features = X_train.shape[1]
hyper_params = [{'n_features_to_select': list(range(1, n_features + 1))}]

lm = LinearRegression()
# RFE refits its own clone of the estimator, so the search does not need this
# fit; it is kept so that `lm` itself remains a fitted model.
lm.fit(X_train, y_train)
rfe = RFE(lm)

model_cv = GridSearchCV(estimator = rfe,
                        param_grid = hyper_params,
                        scoring= 'r2',
                        cv = folds,
                        verbose = 1,
                        return_train_score=True)
model_cv.fit(X_train, y_train)


In [None]:
# Tabulate the grid-search results (one row per candidate value of n_features_to_select).
cv_results = pd.DataFrame(model_cv.cv_results_)
cv_results

In [None]:
# Mean cross-validated R^2 (validation folds vs. training folds) per number of selected features.
fig, ax = plt.subplots(figsize=(12, 5))
n_selected = cv_results["param_n_features_to_select"]
ax.plot(n_selected, cv_results["mean_test_score"], linewidth=2)
ax.plot(n_selected, cv_results["mean_train_score"], color='red', linewidth=2)
ax.set_xlabel('number of features')
ax.set_ylabel('r-squared')
ax.set_title("Optimal Number of Features")
ax.legend(['test score', 'train score'], loc='upper left')