In [1]:
%matplotlib inline

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, RandomizedSearchCV, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
import matplotlib.pyplot as plt

In [2]:
df = pd.read_stata('../../../HCMST 2017 fresh sample for public sharing draft v1.1.dta')

In [3]:
chosen_cols = [
    'time_from_met_to_rel',  # numeryczna - czas (w latach), który upłynął od poznania do wejścia w związek
    'time_from_rel_to_cohab', # numeryczna - czas (w latach), który upłynął od wejścia w związek do zamieszkania razem
    'Q25',  # jakościowa - czy uczęszczałeś z partnerem do tej samej szkoły średniej
    'Q12', # jakościowa - jak partner opisuje swoje poglądy polityczne
    'S1',  # output
]
df[chosen_cols].head()

Unnamed: 0,time_from_met_to_rel,time_from_rel_to_cohab,Q25,Q12,S1
0,0.0,,,,"No, I am not Married"
1,12.25,0.5,Different High School,Leans Republican,"Yes, I am Married"
2,0.416748,0.083252,Different High School,Leans Democrat,"Yes, I am Married"
3,1.083252,,,,"No, I am not Married"
4,0.083252,0.833374,Different High School,Strong Democrat,"Yes, I am Married"


In [4]:
# kilka parametrów danych
print(df[chosen_cols].count())  # liczba niepustych wartości w każdej kolumnie 
print('=' * 30)
print('before drop:', len(df[chosen_cols]))
print('after drop:', len(df[chosen_cols].dropna()))  # liczba wierszy przed i po usunięciu niekompletnych wierszy

cleaner_df = df[chosen_cols].dropna()
cleaner_df['S1'] = cleaner_df['S1'].apply(lambda x: 1 if x == 'Yes, I am Married' else 0)
cleaner_df['S1'] = cleaner_df['S1'].astype('float')  # przekształcenie kolumny wynikowej do postaci 0/1
final_df = cleaner_df
final_df.head()

time_from_met_to_rel      3263
time_from_rel_to_cohab    2578
Q25                       2856
Q12                       2856
S1                        3510
dtype: int64
before drop: 3510
after drop: 2354


Unnamed: 0,time_from_met_to_rel,time_from_rel_to_cohab,Q25,Q12,S1
1,12.25,0.5,Different High School,Leans Republican,1.0
2,0.416748,0.083252,Different High School,Leans Democrat,1.0
4,0.083252,0.833374,Different High School,Strong Democrat,1.0
5,0.5,0.0,Different High School,Undecided/Independent/Other,1.0
6,0.25,2.583374,Different High School,Leans Republican,1.0


In [5]:
X = final_df.loc[:, final_df.columns != 'S1']
y = final_df.loc[:, 'S1']


In [7]:
clf = RandomForestClassifier(**{'max_depth': 7, 'max_features': 3, 'min_samples_split': 2, 'n_estimators': 22},
                             n_jobs=-1)
X_train = pd.get_dummies(X).sort_index(axis=1)
y_train = y
clf.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=7, max_features=3, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=22, n_jobs=-1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [45]:
chosen_row = X.loc[1398]
chosen_row

time_from_met_to_rel                          3
time_from_rel_to_cohab                     4.25
Q25                       Different High School
Q12                           Strong Republican
Name: 1398, dtype: object

In [61]:
tfmtr_sigma = .5
tfrtc_sigma = .5
q25_vals = list(X.Q25.unique())
q12_vals = list(X.Q12.unique())

In [62]:
import random
# generowanie 1000 próbek podobnych do wyjściowej i zapisanie ich w dataframie
generated_sample = [[np.random.normal(chosen_row.time_from_met_to_rel, tfmtr_sigma),
                    np.random.normal(chosen_row.time_from_rel_to_cohab, tfrtc_sigma),
                    random.choice(q25_vals),
                    random.choice(q12_vals),] for _ in range(1000)]
generated_sample_dict = dict(zip(X.columns, list(zip(*generated_sample))))
generated_sample_df = pd.DataFrame(generated_sample_dict)

In [63]:
# przekształcenie danych do formatu wejściowego do modelu, wyliczenie predykcji modelu i wag
generated_sample_df = pd.get_dummies(generated_sample_df)[X_train.columns]
gs_preds = clf.predict_proba(generated_sample_df)[:, 1]
gs_weights = 1 / np.apply_along_axis(np.linalg.norm, 1, np.array(X_train.loc[1398]) - np.array(generated_sample_df))

In [71]:
# wytrenowanie modelu białej skrzynki - ważonej regresji liniowej
from sklearn.linear_model import LinearRegression
white_box_model = LinearRegression()
white_box_model.fit(generated_sample_df, gs_preds, sample_weight=gs_weights)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)

In [76]:
# wypisanie wag białej skrzynki dla poszczególnych zmiennych
for param, coef in zip(generated_sample_df.columns, white_box_model.coef_):
    print(f"{param}: {coef}")
print(f"constant term: {white_box_model.intercept_}")

Q12_Leans Democrat: -489735419399.629
Q12_Leans Republican: -489735419399.59094
Q12_Not Strong Democrat: -489735419399.61273
Q12_Not Strong Republican: -489735419399.5441
Q12_Refused: -489735419399.60516
Q12_Strong Democrat: -489735419399.58136
Q12_Strong Republican: -489735419399.57733
Q12_Undecided/Independent/Other: -489735419399.6792
Q25_Different High School: -305647814073.93524
Q25_Refused: -305647814073.9081
Q25_Same High School: -305647814073.9256
time_from_met_to_rel: -0.008701709416007722
time_from_rel_to_cohab: -0.0027034284714271134
constant term: 795383233474.4426


In [88]:
# analogiczne kroki, jednak przy ustalonych zmiennych jakościowych
x_train_nonnumeric = dict(X_train.loc[1398, ['Q12_Leans Democrat', 'Q12_Leans Republican', 'Q12_Not Strong Democrat',
       'Q12_Not Strong Republican', 'Q12_Refused', 'Q12_Strong Democrat',
       'Q12_Strong Republican', 'Q12_Undecided/Independent/Other',
       'Q25_Different High School', 'Q25_Refused', 'Q25_Same High School']])
new_generated_sample_dict = {x: [x_train_nonnumeric[x] for _ in range(1000)] for x in x_train_nonnumeric}
new_generated_sample_dict['time_from_met_to_rel'] = [np.random.normal(chosen_row.time_from_met_to_rel, tfmtr_sigma) for _ in range(1000)]
new_generated_sample_dict['time_from_rel_to_cohab'] = [np.random.normal(chosen_row.time_from_rel_to_cohab, tfrtc_sigma) for _ in range(1000)]
new_generated_sample_df = pd.DataFrame(new_generated_sample_dict)
new_generated_sample_df = new_generated_sample_df[X_train.columns]

In [89]:
ngs_preds = clf.predict_proba(new_generated_sample_df)[:, 1]
ngs_weights = 1 / np.apply_along_axis(np.linalg.norm, 1, np.array(X_train.loc[1398]) - np.array(new_generated_sample_df))

In [90]:
from sklearn.linear_model import LinearRegression
new_white_box_model = LinearRegression()
new_white_box_model.fit(new_generated_sample_df, ngs_preds, sample_weight=ngs_weights)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)

In [91]:
for param, coef in zip(new_generated_sample_df.columns, new_white_box_model.coef_):
    print(f"{param}: {coef}")
print(f"constant term: {new_white_box_model.intercept_}")

Q12_Leans Democrat: 0.0
Q12_Leans Republican: 7.305535042165965e-18
Q12_Not Strong Democrat: -1.193151311531375e-17
Q12_Not Strong Republican: 1.3147748429207506e-47
Q12_Refused: 2.5922646642425427e-48
Q12_Strong Democrat: 1.3003784544849074e-63
Q12_Strong Republican: -3.4771057059445286e-63
Q12_Undecided/Independent/Other: 2.83704352469712e-79
Q25_Different High School: -4.957637195972181e-80
Q25_Refused: 9.15364443057667e-96
Q25_Same High School: 2.944482734352086e-96
time_from_met_to_rel: -0.06223550522517227
time_from_rel_to_cohab: 0.004079876301359343
constant term: 1.0885082621826951
