In [1]:
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np
import warnings
from scipy import stats
import statsmodels.api as sm
from statsmodels.formula.api import logit, probit, poisson, ols

In [2]:
import scipy as scp
import sklearn

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn import metrics 
from sklearn.metrics import confusion_matrix

# Load data

In [3]:
# Read the occupation data set from the CSV file next to the notebook.
occupation_csv = 'occupation_data.csv'
occupation = pd.read_csv(occupation_csv)

In [5]:
# Country-of-origin labels in coding order: the first label gets code 1, the last code 11.
origin_labels = ['Germany', 'Turkey', 'Italy', 'Former Yugo', 'Other European',
                 'Greece', 'Poland', 'Africa', 'Eastern Europe', 'Middle East',
                 'Others']
conditions = [occupation['country_origin'] == label for label in origin_labels]
choices = list(range(1, len(origin_labels) + 1))

# Rows whose country_origin matches none of the labels (missing values included) become NaN.
occupation['country'] = np.select(conditions, choices, default=np.nan)

In [6]:
# Recode migback into a binary column: migback == 1 -> 0, migback == 3 -> 1,
# and every other value (missing included) -> NaN.
migback_values = occupation['migback']
migconditions = [migback_values == code for code in (1, 3)]
migchoices = [0, 1]
occupation['migration'] = np.select(migconditions, migchoices, default=np.nan)

In [7]:
# Binary indicator: 1 where sex == 1, 0 for every other value (a missing sex also yields 0).
is_male = occupation['sex'].eq(1)
occupation['male'] = np.where(is_male, 1, 0)

### Basic model

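$$ \text{Occupation} = \text{Sex} + \text{bioage} + \text{country} + \text{migback} \times \text{country} + \text{locchild} $$

Note: the first fit below uses the columns `male`, `bioage`, `country`, `migration` and `locchildh` as main effects only; the $\text{migback} \times \text{country}$ interaction is not part of its design matrix.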

In [8]:
# Restrict the data to the outcome and the regressors of the basic model.
basic_columns = ['occ_choices', 'male', 'bioage', 'country', 'migration', 'locchildh']
basic = occupation[basic_columns]

In [9]:
# Keep only the rows where locchildh is observed.
has_locchildh = basic['locchildh'].notna()
basic = basic[has_locchildh]

In [10]:
# Show (rows, columns) of the modelling frame after the locchildh filter.
basic.shape

(401873, 6)

In [11]:
# Show how many rows fall into each category of the outcome variable.
basic['occ_choices'].value_counts()

white_collar       159816
blue_collar         74133
unemployed          71337
self_employment     31083
schooling           25308
civil_servant       23609
apprentice          14946
military             1641
Name: occ_choices, dtype: int64

In [12]:
# Show how many rows have male == 0 versus male == 1.
basic['male'].value_counts()

0    210674
1    191199
Name: male, dtype: int64

In [13]:
# Separate the outcome (y) from the remaining columns, which serve as regressors (X).
target_column = 'occ_choices'
y = basic[target_column]
X = basic.drop(columns=target_column)

In [16]:
# Hold out 20% of the rows for testing; the fixed random_state makes the split repeatable.
split_parts = train_test_split(X, y, test_size=0.20, random_state=5)
X_train, X_test, y_train, y_test = split_parts

In [17]:
# Fit a multinomial logit with statsmodels so the coefficient table can be inspected.
# NOTE(review): `country` is passed as a single numeric column (codes 1-11), so each
# equation estimates one slope for it — confirm this is intended rather than dummy coding.
design_train = sm.add_constant(X_train)  # prepend the intercept column

logit_model = sm.MNLogit(y_train, design_train)
result = logit_model.fit()

stats1 = result.summary()
print(stats1)

Optimization terminated successfully.
         Current function value: 1.449705
         Iterations 10
                          MNLogit Regression Results                          
Dep. Variable:            occ_choices   No. Observations:               321498
Model:                        MNLogit   Df Residuals:                   321456
Method:                           MLE   Df Model:                           35
Date:                Thu, 17 Dec 2020   Pseudo R-squ.:                  0.1313
Time:                        21:08:25   Log-Likelihood:            -4.6608e+05
converged:                       True   LL-Null:                   -5.3650e+05
Covariance Type:            nonrobust   LLR p-value:                     0.000
    occ_choices=blue_collar       coef    std err          z      P>|z|      [0.025      0.975]
-----------------------------------------------------------------------------------------------
const                          -6.6103      0.062   -106.595      0.000  