In [1]:
import pandas as pd
import numpy as np

from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier

In [2]:
# Read the enrollee data set (path relative to the working directory) and
# preview its first two rows.
df = pd.read_csv(filepath_or_buffer = "edu_enrollees.csv")
df.iloc[:2]

Unnamed: 0,enrollee_id,city,city_development_index,gender,relevant_experience,enrolled_university,education_level,major_discipline,experience,company_size,company_type,last_new_job,training_hours,target,Xgrp
0,8949.0,city_103,0.92,Male,Has relevant experience,no_enrollment,Graduate,STEM,>20,,,1,36.0,1.0,train
1,29725.0,city_40,0.776,Male,No relevant experience,no_enrollment,Graduate,STEM,15,50-99,Pvt Ltd,>4,47.0,0.0,train


### 전처리

In [3]:
# Remove the three columns that the rest of the notebook does not use.
unused_cols = ["city", "company_size", "company_type"]
df = df.drop(unused_cols, axis = "columns")

In [4]:
# Inspect the dtype of every remaining column.
df.dtypes

enrollee_id               float64
city_development_index    float64
gender                     object
relevant_experience        object
enrolled_university        object
education_level            object
major_discipline           object
experience                 object
last_new_job               object
training_hours            float64
target                    float64
Xgrp                       object
dtype: object

In [5]:
# Count missing values per column.
df.isna().sum()

enrollee_id                  0
city_development_index       0
gender                    4508
relevant_experience          0
enrolled_university        386
education_level            460
major_discipline          2813
experience                  65
last_new_job               423
training_hours               0
target                       0
Xgrp                         0
dtype: int64

In [6]:
# Keep only complete rows (a single missing value removes the row), then
# re-count missing values per column to verify none remain.
df = df.dropna(axis = "index", how = "any")
df.isnull().sum()

enrollee_id               0
city_development_index    0
gender                    0
relevant_experience       0
enrolled_university       0
education_level           0
major_discipline          0
experience                0
last_new_job              0
training_hours            0
target                    0
Xgrp                      0
dtype: int64

In [7]:
# List the distinct raw labels of the two columns that are cast to integers
# further down.
df["experience"].unique(), df["last_new_job"].unique()

(array(['>20', '15', '13', '7', '5', '16', '4', '11', '<1', '18', '19',
        '12', '10', '9', '2', '6', '14', '3', '8', '20', '17', '1'],
       dtype=object),
 array(['1', '>4', '4', '3', '2', 'never'], dtype=object))

In [8]:
# Exclude rows whose `experience` or `last_new_job` carries one of the
# non-numeric labels, so that both columns can be cast to integers afterwards.
has_numeric_experience = ~df["experience"].isin([">20", "<1"])
has_numeric_last_job = ~df["last_new_job"].isin([">4", "never"])
df = df.loc[has_numeric_experience & has_numeric_last_job]

In [9]:
# Cast both ordinal columns to integers in a single step.
# `DataFrame.astype` returns a new frame, so nothing is written into the
# filtered selection that `df` currently refers to; assigning columns on such
# a selection can trigger pandas' SettingWithCopyWarning.
df = df.astype({"experience": "int", "last_new_job": "int"})

In [11]:
# Number of rows left after the preprocessing steps above.
len(df)

7522

In [12]:
# Store the preprocessed frame under a new name with a fresh 0..n-1 index;
# the question sections below all start from df_base.
df_base = df.reset_index(drop = True)

### Q1.

In [14]:
# Work on an independent two-column copy: the grouping variable and the target.
q1_cols = ["relevant_experience", "target"]
df_q1 = df_base.loc[:, q1_cols].copy()
df_q1.iloc[:2]

Unnamed: 0,relevant_experience,target
0,Has relevant experience,1.0
1,Has relevant experience,0.0


In [15]:
# Split df_q1 into rows with the "Has relevant experience" label and all
# remaining rows.
has_relevant = df_q1["relevant_experience"].eq("Has relevant experience")
df_q1_sub1 = df_q1.loc[has_relevant]
df_q1_sub2 = df_q1.loc[~has_relevant]

In [16]:
# Check which labels occur in relevant_experience.
df_q1["relevant_experience"].unique()

array(['Has relevant experience', 'No relevant experience'], dtype=object)

In [17]:
# Group A: no relevant experience; group B: has relevant experience.
experience_label = df_q1["relevant_experience"]
df_q1_subA = df_q1.loc[experience_label.eq("No relevant experience")]
df_q1_subB = df_q1.loc[experience_label.eq("Has relevant experience")]

In [25]:
# Relative frequency of target label 1 within group A.
target_share_A = df_q1_subA["target"].value_counts(normalize = True)
target_share_A.loc[1]

0.38287331917905165

In [23]:
# Relative frequency of target label 1 within group B.
target_share_B = df_q1_subB["target"].value_counts(normalize = True)
target_share_B.loc[1]

0.21591095105581928

In [26]:
# Ratio of the positive-target rate in group A to that in group B.
# For a 0/1-coded target the mean equals the share of 1s. Unlike
# `value_counts(normalize = True)[1]`, it does not raise a KeyError when a
# group contains no positives and does not depend on how the integer key 1 is
# matched against the float index. It also agrees with the groupby-mean
# cross-check used elsewhere in this section.
stat_A = df_q1_subA["target"].mean()
stat_B = df_q1_subB["target"].mean()
round(stat_A / stat_B, 2)

1.77

In [27]:
# Cross-check: mean of target per relevant_experience group.
df_q1.groupby("relevant_experience")["target"].mean()

relevant_experience
Has relevant experience    0.215911
No relevant experience     0.382873
Name: target, dtype: float64

### Q2.

In [28]:
# Show the columns available in df_base.
df_base.columns

Index(['enrollee_id', 'city_development_index', 'gender',
       'relevant_experience', 'enrolled_university', 'education_level',
       'major_discipline', 'experience', 'last_new_job', 'training_hours',
       'target', 'Xgrp'],
      dtype='object')

In [29]:
# Take the contiguous run of columns from "gender" through "major_discipline"
# (label slices include both ends) and preview one row.
categorical_span = slice("gender", "major_discipline")
df_q2_cat = df_base.loc[:, categorical_span]
df_q2_cat.iloc[:1]

Unnamed: 0,gender,relevant_experience,enrolled_university,education_level,major_discipline
0,Male,Has relevant experience,no_enrollment,Graduate,STEM


In [31]:
# One-hot encode every categorical column.
# The dummy dtype is pinned to uint8 so the indicators are 0/1 regardless of
# the installed pandas version (newer pandas defaults to bool, which would
# render as True/False instead of the 0/1 values this notebook works with).
df_q2_dum = pd.get_dummies(df_q2_cat,
                           columns = list(df_q2_cat.columns),
                           dtype = np.uint8)
df_q2_dum.head(1)

Unnamed: 0,gender_Female,gender_Male,gender_Other,relevant_experience_Has relevant experience,relevant_experience_No relevant experience,enrolled_university_Full time course,enrolled_university_Part time course,enrolled_university_no_enrollment,education_level_Graduate,education_level_Masters,education_level_Phd,major_discipline_Arts,major_discipline_Business Degree,major_discipline_Humanities,major_discipline_No Major,major_discipline_Other,major_discipline_STEM
0,0,1,0,1,0,0,0,1,1,0,0,0,0,0,0,0,1


In [36]:
# Drop one indicator per categorical variable; the dropped level becomes that
# variable's reference category.
drop_cols = [
    "gender_Other",
    "relevant_experience_No relevant experience",
    "enrolled_university_no_enrollment",
    "education_level_Phd",
    "major_discipline_STEM",
]
df_q2_dum2 = df_q2_dum.drop(drop_cols, axis = "columns")
df_q2_dum2.iloc[:2]

Unnamed: 0,gender_Female,gender_Male,relevant_experience_Has relevant experience,enrolled_university_Full time course,enrolled_university_Part time course,education_level_Graduate,education_level_Masters,major_discipline_Arts,major_discipline_Business Degree,major_discipline_Humanities,major_discipline_No Major,major_discipline_Other
0,0,1,1,0,0,1,0,0,0,0,0,0
1,0,1,1,0,0,1,0,0,0,0,0,0


In [39]:
# Column positions 0..16 minus the five positions to be excluded.
set(range(17)).difference({2, 4, 7, 10, 16})

{0, 1, 3, 5, 6, 8, 9, 11, 12, 13, 14, 15}

In [41]:
# Positional alternative to dropping the reference levels by name.
# The kept positions are built as an ordered list: iterating a set has no
# guaranteed order, so `list(set(...))` could reorder the columns. The column
# count is read from the frame instead of being hard-coded.
excluded_positions = {2, 4, 7, 10, 16}
kept_positions = [pos for pos in range(df_q2_dum.shape[1])
                  if pos not in excluded_positions]
df_q2_dum2 = df_q2_dum.iloc[:, kept_positions]
df_q2_dum2.head(2)

Unnamed: 0,gender_Female,gender_Male,relevant_experience_Has relevant experience,enrolled_university_Full time course,enrolled_university_Part time course,education_level_Graduate,education_level_Masters,major_discipline_Arts,major_discipline_Business Degree,major_discipline_Humanities,major_discipline_No Major,major_discipline_Other
0,0,1,1,0,0,1,0,0,0,0,0,0
1,0,1,1,0,0,1,0,0,0,0,0,0


In [35]:
# Lookup table: integer position of every dummy column next to its name.
pd.Series(df_q2_dum.columns).reset_index()

Unnamed: 0,index,0
0,0,gender_Female
1,1,gender_Male
2,2,gender_Other
3,3,relevant_experience_Has relevant experience
4,4,relevant_experience_No relevant experience
5,5,enrolled_university_Full time course
6,6,enrolled_university_Part time course
7,7,enrolled_university_no_enrollment
8,8,education_level_Graduate
9,9,education_level_Masters


In [42]:
# Show the columns of df_base again before assembling the modelling frame.
df_base.columns

Index(['enrollee_id', 'city_development_index', 'gender',
       'relevant_experience', 'enrolled_university', 'education_level',
       'major_discipline', 'experience', 'last_new_job', 'training_hours',
       'target', 'Xgrp'],
      dtype='object')

In [55]:
# Assemble the modelling frame: target, split label and the four numeric
# columns from df_base, followed by the reduced dummy columns.
base_cols = ["target", "Xgrp",
             "city_development_index", "experience",
             "last_new_job", "training_hours"]
model_parts = [df_base[base_cols], df_q2_dum2]
df_job2 = pd.concat(model_parts, axis = 1).reset_index(drop = True)
df_job2.iloc[:2]

Unnamed: 0,target,Xgrp,city_development_index,experience,last_new_job,training_hours,gender_Female,gender_Male,relevant_experience_Has relevant experience,enrolled_university_Full time course,enrolled_university_Part time course,education_level_Graduate,education_level_Masters,major_discipline_Arts,major_discipline_Business Degree,major_discipline_Humanities,major_discipline_No Major,major_discipline_Other
0,1.0,train,0.92,7,1,46.0,0,1,1,0,0,1,0,0,0,0,0,0
1,0.0,train,0.92,5,1,108.0,0,1,1,0,0,1,0,0,0,0,0,0


In [56]:
# Logistic regression on every row of df_job2 (no filtering by Xgrp here).
# C is the inverse regularisation strength, so C = 100000 leaves the
# coefficients almost unpenalised.
lr_features = df_job2.drop(columns = ["target", "Xgrp"])
lr_target = df_job2["target"]
model_lr = LogisticRegression(solver = "liblinear", C = 100000,
                              max_iter = 1000, random_state = 123)
model_lr.fit(lr_features, lr_target)

LogisticRegression(C=100000, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=1000,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=123, solver='liblinear', tol=0.0001, verbose=0,
                   warm_start=False)

In [57]:
# Sanity check of np.exp: exp(1) is Euler's number e.
np.exp(1)

2.718281828459045

In [58]:
# Largest odds ratio: exponentiate the largest fitted coefficient.
largest_coef = model_lr.coef_.max()
np.exp(largest_coef)

1.6717183500321011

최신 버전 scikit-learn에서의 결과: 1.6721493496611732

In [52]:
# Odds ratio of every feature, labelled with its column name.
# The names are derived with the same column selection that was passed to
# `model_lr.fit`, instead of the positional slice `columns[2:]`, so the labels
# stay aligned with the coefficients even if the column order of df_job2
# changes.
feature_names = df_job2.drop(columns = ["target", "Xgrp"]).columns
pd.DataFrame(np.exp(model_lr.coef_),
             columns = feature_names)

Unnamed: 0,city_development_index,experience,last_new_job,training_hours,gender_Female,gender_Male,gender_Other,relevant_experience_Has relevant experience,relevant_experience_No relevant experience,enrolled_university_Full time course,...,enrolled_university_no_enrollment,education_level_Graduate,education_level_Masters,education_level_Phd,major_discipline_Arts,major_discipline_Business Degree,major_discipline_Humanities,major_discipline_No Major,major_discipline_Other,major_discipline_STEM
0,0.002125,0.971921,1.099645,0.999068,1.578935,1.630293,1.893209,1.503626,3.241073,2.627656,...,1.570094,2.051334,1.514926,1.568197,1.568192,1.32318,1.5026,1.758624,0.754837,1.177446


In [54]:
# Display the installed scikit-learn version.
# NOTE(review): consider moving this import into the import cell at the top.
import sklearn
sklearn.__version__

'0.21.3'

In [None]:
# NOTE(review): '1.2.2' — presumably the scikit-learn version of the newer
# environment that produced the alternative result noted above; confirm.

### Q3.

In [59]:
# Check which split labels occur in Xgrp.
df_job2["Xgrp"].unique()

array(['train', 'test'], dtype=object)

In [61]:
# Split by the Xgrp label, drop the label column and renumber the rows of
# each part from 0.
df_train, df_test = (
    df_job2.loc[df_job2["Xgrp"] == split_label]
           .drop(columns = "Xgrp")
           .reset_index(drop = True)
    for split_label in ("train", "test")
)

In [62]:
# Row counts of the train and test parts.
df_train.shape[0], df_test.shape[0]

(4706, 2816)

In [66]:
# Fit a 5-nearest-neighbour classifier on the train part and predict the
# target for the test part.
# NOTE(review): the features are passed in unscaled, so columns with larger
# numeric ranges weigh more in the distance — confirm this is intended.
knn_train_features = df_train.drop(columns = "target")
knn_test_features = df_test.drop(columns = "target")
model_knn = KNeighborsClassifier(n_neighbors = 5).fit(knn_train_features,
                                                      df_train["target"])
pred = model_knn.predict(knn_test_features)

In [67]:
# Peek at the first three predicted labels.
pred[:3]

array([0., 0., 0.])

In [68]:
# Confusion table: actual target in rows, predicted label in columns.
pd.crosstab(index = df_test["target"], columns = pred)

col_0,0.0,1.0
target,Unnamed: 1_level_1,Unnamed: 2_level_1
0.0,1901,191
1.0,615,109


In [72]:
# Accuracy from the confusion table: correct predictions (diagonal) divided by
# all predictions.
# The table is first expanded to a square over the union of actual and
# predicted labels. If a label never appears in `pred`, the raw crosstab is
# not square and its diagonal no longer lines up actual with predicted
# labels, which would silently give a wrong accuracy.
df_cross = pd.crosstab(df_test["target"], pred)
all_labels = df_cross.index.union(df_cross.columns)
cross_square = df_cross.reindex(index = all_labels, columns = all_labels,
                                fill_value = 0)
np.trace(cross_square.values) / cross_square.values.sum()

0.7137784090909091

In [73]:
from sklearn.metrics import accuracy_score

In [74]:
# Same accuracy via scikit-learn, as a cross-check of the manual calculation.
accuracy_score(df_test["target"], pred)

0.7137784090909091