In [1]:
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np

In [2]:
import statsmodels.api as sm
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn import metrics 
from sklearn.metrics import confusion_matrix

# Load data

In [3]:
# Read the occupation data set from a CSV file located in the working directory.
occupation = pd.read_csv('occupation_data.csv')

In [8]:
# Count the missing values in each column of the raw data.
occupation.isna().sum()

pid                    0
occ_choices            0
sex                    0
bioage                 0
country_origin         0
migback                0
fsedu              43817
msedu              34246
locchildh          25334
fprofedu           43414
mprofedu           41611
fprofstat          76441
mprofstat         172747
dtype: int64

In [7]:
# One-hot encode country_origin and locchildh.
# drop_first omits the first level of each variable (it acts as the reference
# category); dummy_na adds a separate *_nan indicator column per variable.
occupation_dm = pd.get_dummies(
    occupation,
    columns=['country_origin', 'locchildh'],
    drop_first=True,
    dummy_na=True,
)

In [9]:
# Recode migback into a binary flag: code 1 -> 0, code 3 -> 1, any other code -> NaN.
migchoices = [0, 1]
migconditions = [occupation_dm['migback'].eq(code) for code in (1, 3)]
occupation_dm['migration'] = np.select(migconditions, migchoices, default=np.nan)

In [7]:
# Binary indicator: 1 where sex equals 1, otherwise 0.
occupation_dm['male'] = (occupation_dm['sex'] == 1).astype(int)

In [8]:
# Count the missing values in each column after the dummy encoding.
occupation_dm.isna().sum()

pid                                   0
occ_choices                           0
sex                                   0
bioage                                0
migback                               0
fsedu                             43817
msedu                             34246
fprofedu                          43414
mprofedu                          41611
fprofstat                         76441
mprofstat                        172747
country_origin_Eastern Europe         0
country_origin_Former Yugo            0
country_origin_Germany                0
country_origin_Greece                 0
country_origin_Italy                  0
country_origin_Middle East            0
country_origin_Other European         0
country_origin_Others                 0
country_origin_Poland                 0
country_origin_Turkey                 0
country_origin_nan                    0
locchildh_2.0                         0
locchildh_3.0                         0
locchildh_4.0                         0


In [9]:
# Replace the generated dummy column names with short, identifier-style labels.
# NOTE(review): "country_origin_Other European" has no entry in this mapping,
# so it keeps its generated name - confirm whether that is intended.
dummy_column_labels = {
    "country_origin_Eastern Europe": "Eastern_Europe",
    "country_origin_Former Yugo": "Former_Yugo",
    "country_origin_Germany": "Germany",
    "country_origin_Greece": "Greece",
    "country_origin_Italy": "Italy",
    "country_origin_Middle East": "Middle_East",
    "country_origin_Others": "Others",
    "country_origin_Poland": "Poland",
    "country_origin_Turkey": "Turkey",
    "locchildh_2.0": "medium_city",
    "locchildh_3.0": "small_city",
    "locchildh_4.0": "countryside",
}
occupation_dm = occupation_dm.rename(columns=dummy_column_labels)

### Basic model

$$ \text{Occupation} = \text{Sex} + \text{bioage} + \text{country} + \text{migback} \times \text{country} + \text{locchildh} $$

In [10]:
# Keep only the rows whose locchildh_nan indicator is not 1.
basic = occupation_dm[occupation_dm['locchildh_nan'].ne(1)]

In [11]:
# List the columns available in the filtered frame.
basic.columns

Index(['pid', 'occ_choices', 'sex', 'bioage', 'migback', 'fsedu', 'msedu',
       'fprofedu', 'mprofedu', 'fprofstat', 'mprofstat', 'Eastern_Europe',
       'Former_Yugo', 'Germany', 'Greece', 'Italy', 'Middle_East',
       'country_origin_Other European', 'Others', 'Poland', 'Turkey',
       'country_origin_nan', 'medium_city', 'small_city', 'countryside',
       'locchildh_nan', 'migration', 'male'],
      dtype='object')

In [12]:
# Restrict the frame to the outcome, the regressors of the basic model, and the
# migration flag (needed below to build the interaction term).
# NOTE(review): 'country_origin_Other European' is not part of this selection -
# confirm that leaving it out of the model is intentional.
basic_model_columns = [
    'occ_choices', 'male', 'bioage',
    "Turkey", "Eastern_Europe", "Former_Yugo", "Germany", "Greece",
    "Italy", "Middle_East", "Others", "Poland",
    "medium_city", "small_city", "countryside",
    'migration',
]
basic = basic[basic_model_columns]

In [13]:
# Interaction term: product of the migration flag and the Germany dummy.
# assign() returns a new frame instead of writing a column into a frame that
# was itself obtained by selecting from another one, which is what triggers
# pandas' SettingWithCopyWarning with plain item assignment.
basic = basic.assign(mig_german=basic['migration'] * basic['Germany'])

In [14]:
# Frequency of each category of the outcome variable.
basic['occ_choices'].value_counts()

white_collar          153146
blue_collar            70889
unemployed             64273
training_schooling     40078
self_employment        26691
civil_servant          22140
military                1637
Name: occ_choices, dtype: int64

In [15]:
# Frequency of each value of the male indicator.
basic['male'].value_counts()

0    198726
1    180128
Name: male, dtype: int64

In [16]:
# Target: the occ_choices column.
# Features: every other column of `basic` except the raw migration flag.
y = basic['occ_choices']
X = basic.drop(columns=['occ_choices', 'migration'])
X.columns

Index(['male', 'bioage', 'Turkey', 'Eastern_Europe', 'Former_Yugo', 'Germany',
       'Greece', 'Italy', 'Middle_East', 'Others', 'Poland', 'medium_city',
       'small_city', 'countryside', 'mig_german'],
      dtype='object')

In [17]:
# (rows, columns) of the feature matrix.
X.shape

(378854, 15)

In [13]:
# Hold out 20% of the rows for evaluation; random_state is fixed so the split
# is reproducible.
# Use the train_test_split name imported at the top of the notebook: the
# `from sklearn... import ...` statements never bind the bare name `sklearn`,
# so `sklearn.model_selection.train_test_split` raises NameError on a fresh kernel.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=5)
# Report the shape of both partitions instead of printing the whole feature frame.
print(X_train.shape)
print(X_test.shape)

In [18]:
# Fit a multinomial logit with statsmodels to inspect the coefficient estimates.
# An intercept column is added to the features before fitting.
# NOTE(review): the output recorded for this cell reports `converged: False`
# after 35 iterations - treat the estimates with caution until the fit converges.
exog = sm.add_constant(X)
logit_model = sm.MNLogit(y, exog)
result = logit_model.fit()
stats1 = result.summary()
print(stats1)

         Current function value: 1.367794
         Iterations: 35




                          MNLogit Regression Results                          
Dep. Variable:            occ_choices   No. Observations:               378854
Model:                        MNLogit   Df Residuals:                   378758
Method:                           MLE   Df Model:                           90
Date:                Sat, 19 Dec 2020   Pseudo R-squ.:                  0.1423
Time:                        17:34:13   Log-Likelihood:            -5.1819e+05
converged:                      False   LL-Null:                   -6.0417e+05
Covariance Type:            nonrobust   LLR p-value:                     0.000
     occ_choices=civil_servant       coef    std err          z      P>|z|      [0.025      0.975]
--------------------------------------------------------------------------------------------------
const                             -2.2749      0.151    -15.054      0.000      -2.571      -1.979
male                              -0.4007      0.016    -24.928      0.