In [220]:
import pandas as pd
import numpy as np
import seaborn as sns
from sklearn.metrics import f1_score, make_scorer
import matplotlib.pyplot as plt
from sklearn.decomposition import KernelPCA, PCA
sns.set(context="paper", font="monospace")

In [4]:
data = pd.read_csv('_847b6cd041836e8fa914243e83d898fb_orange_small_churn_data.txt')
labels = pd.read_csv('_ce5e9e4280a8856218f1886f007de2a4_orange_small_churn_labels.txt', header=None)

In [7]:
data.shape, labels.shape

((40000, 230), (40000, 1))

In [95]:
labels[0].value_counts()
count = pd.DataFrame(data.isnull().sum(axis=0))

more_38000 = count[count[0] < 38000].index
less_38000 = count[count[0] >= 38000].index

In [132]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import LabelEncoder


In [136]:
# encode categorical data
data.iloc[:, 190:] = data.iloc[:, 190:].apply(lambda x: LabelEncoder().fit_transform(x.astype(str)))

In [176]:
# replace nans with mean in numerical data
data_mean = data.copy()
data_mean.iloc[:, :190] = data_mean.iloc[:, :190].fillna(data_mean.iloc[:, :190].mean())
# delete completely null columns
df = data_mean.isnull().all(axis=0)
null_columns = df[df].index
data_mean = data_mean.drop(null_columns, axis=1)

In [203]:
# train random forest only on 77 features
cls = RandomForestClassifier(n_estimators=200)
cross_val_score(cls, data_mean[more_38000].values, labels.values.reshape(40000,), scoring=make_scorer(f1_score)).mean()

0.0013420570418144989

In [147]:
from sklearn.svm import LinearSVC

In [184]:
# train linear model on all features
cls = LinearSVC(penalty='l1', dual=False)
cls.fit(data_mean.values, labels.values.reshape(40000,))
print(cls.coef_)
#cross_val_score(cls, data_mean.values, labels.values.reshape(40000,)).mean()

[[ -1.09281751e-03  -2.34447238e-02  -5.79887955e-08   4.67241978e-04
   -9.88464875e-09  -1.74836780e-06  -3.06244685e-03  -7.67846395e-05
    1.87380716e-08  -5.73778708e-03   1.82009145e-03   3.73907655e-07
   -1.05872138e-02  -4.07735862e-05   2.65801677e-06   4.52262848e-03
    2.10227328e-02  -1.48775084e-04   2.72500512e-05   1.43511542e-04
   -4.86446906e-05   7.36365907e-05  -2.09138409e-03  -5.21951682e-02
    2.49989470e-05  -7.97746196e-02  -9.57618952e-04  -2.14971787e-07
   -9.88036566e-03   0.00000000e+00  -4.63197862e-08  -1.09750628e-08
   -1.25034771e-09   2.39939544e-03   9.21889685e-05  -7.01403075e-03
   -1.82036757e-03  -8.62396033e-06   1.95118631e-03  -3.01590769e-03
   -1.11669748e-01   2.05709587e-04   6.11942185e-08   3.63049051e-08
    2.13495389e-04   1.68105596e-10  -4.98579663e-04  -2.40159632e-08
   -5.04120255e-08   2.68813890e-03   3.87804275e-04  -6.05037022e-03
   -4.62078303e-05   6.01387366e-07   5.86728163e-04  -1.19876285e-04
   -3.33729302e-02  

In [189]:
linear_features = np.where(cls.coef_[0] > 10 ** -4)

In [205]:
# linear model on linear features
cls = LinearSVC()
cross_val_score(cls, data_mean.values[:, linear_features[0]], labels.values.reshape(40000,), scoring=make_scorer(f1_score)).mean()

0.012315582281452589

In [226]:
# replace nans with some big value
data_negative = data.copy()
data_negative.iloc[:, :190] = data_negative.iloc[:, :190].fillna(-10**10)

# delete completely null columns
df = data_negative.isnull().all(axis=0)
null_columns = df[df].index
data_negative = data_negative.drop(null_columns, axis=1)

pca = PCA(100)
data_negative = pca.fit_transform(data_negative)
print(data_negative.shape, 'shape after PCA')

# train random forest only on PCA features
cls = RandomForestClassifier(n_estimators=100)
print(cross_val_score(cls, data_negative, labels.values.reshape(40000,), scoring=make_scorer(f1_score)).mean())

(40000, 100) shape after PCA
0.00267537150276


In [218]:
# Next steps may be using ensemble of different classifiers
# using xboost