# IMPORTING DATA AND LIBRARIES

In [None]:
#importing related packages 

import pandas as pd
import numpy as np
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score 
from sklearn.metrics import confusion_matrix as cm
from sklearn.linear_model import SGDClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report,confusion_matrix
from sklearn.metrics import roc_auc_score,roc_curve
# Disable pandas' chained-assignment warning for the whole session.
pd.set_option("mode.chained_assignment", None)

# Iteration cap constant.
# NOTE(review): not referenced by the cells visible here - confirm it is still needed.
max_iter = 10_000_000

# Load the training data.
df_train = pd.read_csv("aug_train.csv")

## DATA OVERVIEW

In [None]:
# Preview the first rows of the training data.
df_train.head()

In [None]:
# Print the column dtypes and non-null counts of the training data.
df_train.info()

In [None]:
def null_status(df):
    """Plot, per column, how many values are present versus missing.

    Draws a stacked bar chart with one bar per column of ``df``: the
    ``num_full`` segment counts the non-null entries and the ``num_na``
    segment counts the null entries.

    Args:
        df: DataFrame to inspect.

    Returns:
        None. The chart is drawn as a side effect.
    """
    num_na = df.isna().sum()

    # Name the index explicitly instead of renaming the "index" column that
    # reset_index() produces: that column is named differently when
    # df.columns carries its own name, which broke the old rename/merge.
    df_status = (
        pd.DataFrame({"num_full": len(df) - num_na, "num_na": num_na})
        .rename_axis("feature")
        .reset_index()
    )

    # `stacked` is a boolean flag; the string "true" only worked because any
    # non-empty string is truthy.
    df_status.plot.bar(x="feature", stacked=True, rot=90)

In [None]:
# Plot present vs. missing value counts for every column of the training data.
null_status(df_train)

In [None]:
# Check whether any features are highly correlated with each other.
# Only numeric/bool columns are passed to corr(): pandas >= 2.0 raises on
# string columns instead of silently dropping them.
corr = df_train.select_dtypes(include=["number", "bool"]).corr()
# Styler.set_precision() was removed in pandas 2.0; an explicit format string
# renders the same two decimals on old and new versions alike.
corr.style.background_gradient(cmap='coolwarm').format("{:.2f}")


## DATA PREPROCESSING

In [None]:
# Report the distinct values and the null count of every object-dtype column.

print("Following features are initially categorical. ")
for column, values in df_train.items():
    # Skip anything that is not stored as a plain object (string) column.
    if values.dtypes != "object":
        continue
    distinct = values.unique()
    missing = int(values.isna().sum())
    print(f"{column} column has {len(distinct)} unique values and {missing} NULL values.")
    print(distinct)

In [None]:
# A function that recodes some variables and fills in missing (NA) values.
# The city column is left untouched; it has more than 100 distinct values, so it shouldn't be treated as a plain categorical feature.

def data_process(df):
    """Clean and impute the known columns of ``df`` in place.

    Only columns that are present are touched, so the function can be used
    on frames that carry a subset of the expected columns.

    * ``gender``, ``enrolled_university``, ``education_level``,
      ``major_discipline``, ``company_type``: missing values become
      ``"Not stated"``.
    * ``experience``: ``">20"`` -> 21, ``"<1"`` -> 0, missing -> 0; stored
      as integers.
    * ``company_size``: ``"<10"`` -> ``"1-9"``, ``"10/49"`` -> ``"10-49"``,
      ``"10000+"`` -> ``"10001-20000"``; missing values get the most
      frequent size.
    * ``last_new_job``: ``">4"`` -> 5, ``"never"`` -> 0; missing values get
      the column mean; stored as integers (fractions are truncated).

    Args:
        df: DataFrame to clean. It is modified in place.

    Returns:
        None.
    """
    not_stated_columns = (
        "gender",
        "enrolled_university",
        # TODO: enrolled_university could be refined using education_level.
        "education_level",
        "major_discipline",
        "company_type",
    )
    for column in not_stated_columns:
        if column in df.columns:
            df[column] = df[column].fillna("Not stated")

    # Whole-column replace() instead of a per-row `df[col].loc[i] = ...`
    # loop: the loop relied on chained assignment and on the index being
    # exactly 0..n-1.
    if "experience" in df.columns:
        experience = df["experience"].replace({">20": "21", "<1": "0"})
        df["experience"] = pd.to_numeric(experience).fillna(0).astype("int")

    if "company_size" in df.columns:
        company_size = df["company_size"].replace(
            {"<10": "1-9", "10/49": "10-49", "10000+": "10001-20000"}
        )
        # Impute from the frame that was passed in (the previous code read
        # the global df_train here). mode() is empty for an all-missing column.
        most_common = company_size.mode()
        if not most_common.empty:
            company_size = company_size.fillna(most_common.iloc[0])
        df["company_size"] = company_size

    if "last_new_job" in df.columns:
        last_new_job = pd.to_numeric(
            df["last_new_job"].replace({">4": "5", "never": "0"})
        )
        # Mean imputation first, then truncate to whole years.
        df["last_new_job"] = last_new_job.fillna(last_new_job.mean()).astype("int")
                            

In [None]:
# Clean df_train in place; data_process assigns into its argument and returns nothing.
data_process(df_train)

## OPTION 1 - LOGISTIC REGRESSION

In [None]:
# Logistic regression needs numeric inputs, so every remaining object-dtype
# column is replaced by its one-hot (dummy) encoding.

object_columns = df_train.select_dtypes(include='object').columns
dummies = pd.get_dummies(df_train[object_columns])
df_train = pd.concat([df_train.drop(columns=object_columns), dummies], axis=1)

In [None]:
# Separate the predictors (X) from the target column (y).

X = df_train.drop(columns="target")
y = df_train["target"].to_frame()

In [None]:
# Hold out 30% of the rows for testing. Stratifying on y keeps the class
# ratio the same in both parts, and the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
    shuffle=True,
    stratify=y,
)

In [None]:
# Fit the logistic regression on the training split. fit() returns the
# estimator itself, and the labels are flattened to the 1-D array it expects.
logistic_model = LogisticRegression(solver='lbfgs', max_iter=10000).fit(
    X_train,
    y_train.to_numpy().ravel(),
)

# Predict class labels for the held-out rows.
y_pred = logistic_model.predict(X_test)

In [None]:
# Per-class precision, recall and F1 on the test split.
report = classification_report(y_test, y_pred)
print(report)

# Confusion matrix, kept as the final expression so the notebook displays it.
confusion_matrix(y_test, y_pred)