In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [2]:
import pandas as pd
import numpy as np

train = pd.read_csv('data/Train.csv')
test = pd.read_csv('data/Test.csv')
label = train.CHURN
user = train.user_id

In [3]:
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import MinMaxScaler, OrdinalEncoder, OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer

In [4]:
class ArpRevTransformer(BaseEstimator, TransformerMixin):
    def __init__(self):
        pass
    def fit(self, X, y=None):
        return self
    def transform(self, X):
        new_X = X.copy()
        
        try:
            new_X.drop(['user_id', 'CHURN'], axis=1, inplace=True)
        except KeyError:
            new_X.drop(['user_id',], axis=1, inplace=True)
        
        ten = sorted(new_X.TENURE.unique())
        new_X['TENURE'] = pd.Categorical(new_X['TENURE'], categories=ten, ordered=True)
        
        new_X['REVENUE'] = new_X.apply(lambda row: row['ARPU_SEGMENT']*3 if np.isnan(row['REVENUE']) else row['REVENUE'], axis=1)
        new_X['ARPU_SEGMENT'] = new_X.apply(lambda row: row['REVENUE']/3 if np.isnan(row['ARPU_SEGMENT']) else row['ARPU_SEGMENT'], axis=1)
        
        for column in new_X.columns:
            if (new_X[column].isna().sum()/len(new_X) * 100) > 40 or new_X[column].nunique() == 1:
                new_X.drop(column, axis=1, inplace=True)

        
        return new_X

In [5]:


class FillMissingValues(BaseEstimator, TransformerMixin):
    def __init__(self):
        pass
    def fit(self, X, y=None):
        return self
    def transform(self, X):
        new_X = X.copy()
        numerical_ix = new_X.select_dtypes(include=['int64', 'float64']).columns
        categorical_ix = df.select_dtypes(include=['object', 'bool']).columns
        ordinal_ix = df.select_dtypes(include=['category']).columns
        
        imp = SimpleImputer(strategy='most_frequent')
        
        new_X[numerical_ix].fillna(new_X[numerical_ix].median(), inplace=True)
        new_X = imp.fit_transform(new_X)
        
        new_X = pd.DataFrame(new_X, columns=X.columns)
        
        
        
        return new_X

In [6]:
df = ArpRevTransformer().fit_transform(train)

In [7]:
numerical_ix = df.select_dtypes(include=['int64', 'float64']).columns
categorical_ix = df.select_dtypes(include=['object', 'bool']).columns
ordinal_ix = df.select_dtypes(include=['category']).columns

In [8]:
imputer = [ 
    ('num_input', SimpleImputer(strategy = 'median'), numerical_ix), 
    ('ord_input', SimpleImputer(strategy='most_frequent'), ordinal_ix), 
    ('cat_input', SimpleImputer(strategy = 'most_frequent'), categorical_ix)
]

encoder = [
    ('cat', OneHotEncoder(handle_unknown='ignore', sparse=False), categorical_ix), 
    ('ordinal', OrdinalEncoder(), ordinal_ix)
]


# i_transform = ColumnTransformer(transformers=imputer)
e_transform = ColumnTransformer(transformers=encoder)

In [11]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, VotingClassifier, GradientBoostingClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC

In [12]:
lr_model = LogisticRegression()
rf_model = RandomForestClassifier(n_jobs=-1, max_depth=10, n_estimators=7, random_state=42)
svc = SVC()
gbc = GradientBoostingClassifier()
scaler = StandardScaler()
cleaner = ArpRevTransformer()
i_transform = FillMissingValues()

In [13]:
rf_pipe = Pipeline(steps=[
    ('clean', cleaner),
    ('imputer', i_transform),
    ('encoder', e_transform),
    ('scale', scaler),
    ('model', rf_model)
])

In [14]:
rf_pipe.fit(train, label)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._update_inplace(new_data)


Pipeline(memory=None,
     steps=[('clean', ArpRevTransformer()), ('imputer', FillMissingValues()), ('encoder', ColumnTransformer(n_jobs=None, remainder='drop', sparse_threshold=0.3,
         transformer_weights=None,
         transformers=[('cat', OneHotEncoder(categorical_features=None, categories=None,
       dtype=<class ...stimators=7, n_jobs=-1,
            oob_score=False, random_state=42, verbose=0, warm_start=False))])

In [15]:
rf_predict_proba = rf_pipe.predict_proba(test)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._update_inplace(new_data)


In [16]:
lr_pipe = Pipeline(steps=[
    ('clean', cleaner),
    ('imputer', i_transform),
    ('encoder', e_transform),
    ('scale', scaler),
    ('model',lr_model)
])

In [26]:
lr_pipe.fit(train, label)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._update_inplace(new_data)


Pipeline(memory=None,
     steps=[('clean', ArpRevTransformer()), ('imputer', FillMissingValues()), ('encoder', ColumnTransformer(n_jobs=None, remainder='drop', sparse_threshold=0.3,
         transformer_weights=None,
         transformers=[('cat', OneHotEncoder(categorical_features=None, categories=None,
       dtype=<class ...penalty='l2', random_state=None, solver='warn',
          tol=0.0001, verbose=0, warm_start=False))])

In [55]:
lr_predict_proba = lr_pipe.predict_proba(test)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._update_inplace(new_data)


In [59]:
# svc_pipe = Pipeline(steps=[
#     ('clean', cleaner),
#     ('imputer', i_transform),
#     ('encoder', e_transform),
#     ('scale', scaler),
#     ('model',svc)
# ])

In [60]:
# svc_pipe.fit(train, label)

In [61]:
# svc_predict_proba = svc_pipe.predict_proba(test)

In [62]:
gbc_pipe = Pipeline(steps=[
    ('clean', cleaner),
    ('imputer', i_transform),
    ('encoder', e_transform),
    ('scale', scaler),
    ('model',gbc)
])

In [64]:
gbc_pipe.fit(train, label)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._update_inplace(new_data)


Pipeline(memory=None,
     steps=[('clean', ArpRevTransformer()), ('imputer', FillMissingValues()), ('encoder', ColumnTransformer(n_jobs=None, remainder='drop', sparse_threshold=0.3,
         transformer_weights=None,
         transformers=[('cat', OneHotEncoder(categorical_features=None, categories=None,
       dtype=<class ...    subsample=1.0, tol=0.0001, validation_fraction=0.1,
              verbose=0, warm_start=False))])

In [65]:
gbc_pred = gbc_pipe.predict_proba(test)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._update_inplace(new_data)


In [71]:
estimators = [('logistic_regression', lr_model), ('random_forest', rf_model), ('gbc', gbc)]

In [108]:
vote = VotingClassifier(estimators=estimators, voting='soft')

In [109]:
voting = Pipeline(steps=[
    ('clean', cleaner),
    ('imputer', i_transform),
    ('encoder', e_transform),
    ('scale', scaler),
    ('model',vote)
])

In [110]:
voting.fit(train, label)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._update_inplace(new_data)


Pipeline(memory=None,
     steps=[('clean', ArpRevTransformer()), ('imputer', FillMissingValues()), ('encoder', ColumnTransformer(n_jobs=None, remainder='drop', sparse_threshold=0.3,
         transformer_weights=None,
         transformers=[('cat', OneHotEncoder(categorical_features=None, categories=None,
       dtype=<class ...0, warm_start=False))],
         flatten_transform=None, n_jobs=None, voting='soft', weights=None))])

In [111]:
import pickle

In [67]:
with open('data/lr.pkl', 'wb') as f:
    pickle.dump(lr_pipe, f)

In [68]:
with open('data/rf.pkl', 'wb') as f:
    pickle.dump(rf_pipe, f)

In [69]:
with open('data/gbc.pkl', 'wb') as f:
    pickle.dump(gbc_pipe, f)

In [112]:
with open('data/vote_soft.pkl', 'wb') as f:
    pickle.dump(voting, f)

In [78]:
y = test

In [113]:
lr = lr_pipe.predict_proba(y)
rf = rf_pipe.predict_proba(y)
gbc = gbc_pipe.predict_proba(y)
v = voting.predict_proba(y)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._update_inplace(new_data)


In [87]:
y_id = y.user_id

In [115]:
# LR = pd.DataFrame({'user_id': y_id, 'CHURN': lr[:,1]})
# RF = pd.DataFrame({'user_id': y_id, 'CHURN': rf[:,1]})
# GBC = pd.DataFrame({'user_id': y_id, 'CHURN': gbc[:,1]})
VOTE = pd.DataFrame({'user_id': y_id, 'CHURN': v[:,1]})

In [116]:
LR.head()

Unnamed: 0,user_id,CHURN
0,af900d87e73b7ff6509d2203df4704a98aa5f2a6,0.281049
1,5335efd940280b82143272275637d1e65d37eadb,0.281049
2,a581f4fa08677c26f83f643248c667e241043086,0.281049
3,64f67177d0775262b8087a9e2e3b8061b6324ae6,0.281049
4,0d6009a4594c4be22449b8d9cc01a0bcea98faea,0.281049


In [117]:
RF.head()

Unnamed: 0,user_id,CHURN
0,af900d87e73b7ff6509d2203df4704a98aa5f2a6,0.280254
1,5335efd940280b82143272275637d1e65d37eadb,0.280254
2,a581f4fa08677c26f83f643248c667e241043086,0.280254
3,64f67177d0775262b8087a9e2e3b8061b6324ae6,0.280254
4,0d6009a4594c4be22449b8d9cc01a0bcea98faea,0.280254


In [118]:
GBC.head()

Unnamed: 0,user_id,CHURN
0,af900d87e73b7ff6509d2203df4704a98aa5f2a6,0.279992
1,5335efd940280b82143272275637d1e65d37eadb,0.279992
2,a581f4fa08677c26f83f643248c667e241043086,0.279992
3,64f67177d0775262b8087a9e2e3b8061b6324ae6,0.279992
4,0d6009a4594c4be22449b8d9cc01a0bcea98faea,0.279992


In [119]:
VOTE.head()

Unnamed: 0,user_id,CHURN
0,af900d87e73b7ff6509d2203df4704a98aa5f2a6,0.280432
1,5335efd940280b82143272275637d1e65d37eadb,0.280432
2,a581f4fa08677c26f83f643248c667e241043086,0.280432
3,64f67177d0775262b8087a9e2e3b8061b6324ae6,0.280432
4,0d6009a4594c4be22449b8d9cc01a0bcea98faea,0.280432


In [120]:
LR.to_csv('data\logistic_regression.csv', index=False)

In [121]:
RF.to_csv('data/random_forest.csv', index=False)

In [122]:
GBC.to_csv('data\gradient_boosting__decent.csv', index=False)

In [124]:
VOTE.to_csv('data/voting_classifier.csv', index=False)