In [121]:
import os
from datetime import datetime, timedelta

import numpy as np
import pandas as pd
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.feature_selection import SelectFromModel
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import Imputer
from sklearn.preprocessing import Normalizer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import PolynomialFeatures
from sklearn.svm import LinearSVC

import categorical_transformer
import classification_model
import cleanup_transformer
import column_extractor
import date_transformer
import FeatureTransformer
import ModelTransformer
import Parallelize
import pandas_column_utilities


# Re-import categorical_transformer so that edits made to the module's source
# are picked up by the running kernel. NOTE: `reload` is a builtin only on
# Python 2; this notebook is written for Python 2.
categorical_transformer = reload(categorical_transformer)

# Features with date and category manipulation; no feature-importance step here.
def get_basic_features(df, ordinal, categorical, date_manip, cont):
    """Assemble the basic feature pipeline and run it over ``df``.

    Four branches are built, combined with ``Parallelize.Parallelize`` and then
    passed through ``cleanup_transformer.cleanup_transformer``:

    * date       -- columns named by ``date_manip`` go through
                    ``date_transformer('%Y-%m-%d')``, are expanded by the day /
                    month / day-of-week / quarter / year transformers, then
                    imputed;
    * continuous -- columns named by ``cont`` are imputed and passed through a
                    wrapped ``Normalizer``;
    * ordinal    -- ``ordinal`` is handed both to the column extractor and to
                    ``categorical_transformer.ordinal_transformer``, followed
                    by imputation;
    * one-hot    -- columns named by ``categorical`` go through
                    ``FeatureTransformer.HotEncoder``.

    Every imputation step gets its own ``Imputer(strategy='most_frequent')``.

    Returns a 2-tuple ``(features, transformed)``: the assembled ``Pipeline``
    and the value of ``features.transform(df)``. Only ``transform`` is called
    here; ``fit`` is never invoked on the pipeline by this function.
    """
    def most_frequent_imputer():
        # A fresh Imputer instance per call, so branches never share one.
        return FeatureTransformer.imputing_transformer(Imputer(strategy='most_frequent'))

    # Date branch: extract -> parse -> expand into calendar parts -> impute.
    date_steps = [
        ('extract', column_extractor.column_extractor(date_manip)),
        ('date_manip', date_transformer.date_transformer('%Y-%m-%d')),
    ]
    date_parts = [
        ('day', date_transformer.day_of_month_transformer()),
        ('month', date_transformer.month_transformer()),
        ('dow', date_transformer.day_of_week_transformer()),
        ('quarter', date_transformer.month_quarter_transformer()),
        ('year', date_transformer.year_transformer()),
    ]
    date_steps.append(('d-m-y-q-dow', Parallelize.Parallelize(date_parts)))
    date_steps.append(('impute', most_frequent_imputer()))
    date_pip = Pipeline(date_steps)

    # Continuous branch: extract -> impute -> normalise.
    continuous_steps = [
        ('extract', column_extractor.column_extractor(cont)),
        ('impute', most_frequent_imputer()),
        ('scale', ModelTransformer.ModelDataframeWrapper(Normalizer())),
    ]
    continuous_pip = Pipeline(continuous_steps)

    # Ordinal branch: extract -> map through the ordinal transformer -> impute.
    ordinal_steps = [
        ('extract', column_extractor.column_extractor(ordinal)),
        ('ord', categorical_transformer.ordinal_transformer(ordinal)),
        ('impute', most_frequent_imputer()),
    ]
    ordinal_pip = Pipeline(ordinal_steps)

    # Categorical branch: extract -> one-hot encode.
    one_hot_steps = [
        ('extract', column_extractor.column_extractor(categorical)),
        ('one_hot', FeatureTransformer.HotEncoder()),
    ]
    one_hot_pip = Pipeline(one_hot_steps)

    # Run the four branches side by side, then clean up the combined result.
    branches = [
        ('date', date_pip),
        ('continuous', continuous_pip),
        ('ordinal_pip', ordinal_pip),
        ('one_hot', one_hot_pip),
    ]
    features = Pipeline([
        ('parallel', Parallelize.Parallelize(branches)),
        ('cleanup', cleanup_transformer.cleanup_transformer()),
    ])

    transformed = features.transform(df)
    return features, transformed

In [2]:
# Directory holding the case-study CSVs. The default is the original author's
# location; set the CHURN_DATA_DIR environment variable to run the notebook on
# another machine without editing this cell.
DATA_DIR = os.environ.get(
    "CHURN_DATA_DIR",
    "/Users/predev/Documents/Learn/MachineLearning/Galvanize/Case_Study_21-2/Data")

# Raw training data for the churn case study.
df = pd.read_csv(os.path.join(DATA_DIR, "churn_train.csv"))

# Per-column overview of the loaded frame (the summary table is the cell output).
pandas_column_utilities.info(df)

Unnamed: 0_level_0,Count,Unnamed: 2_level_0,Type,Possible Values,Few Values
Column,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
avg_dist,40000,non-null,float64,,"[6.94, 8.06, 21.5, 9.46, 13.77, 14.51, 4.87, 1..."
avg_rating_by_driver,39838,non-null,float64,,"[5.0, 4.0, 4.7, 4.8, 4.3, 4.6, 4.9, 4.4, 3.0, ..."
avg_rating_of_driver,33472,non-null,float64,,"[5.0, nan, 4.0, 4.7, 3.0, 3.7, 4.5, 3.3, 3.6, ..."
avg_surge,40000,non-null,float64,,"[1.0, 2.75, 2.0, 1.15, 1.02, 1.11, 1.2, 1.03, ..."
city,40000,non-null,object,,"[Astapor, Winterfell, King's Landing]"
last_trip_date,40000,non-null,object,{DATE},"[2014-05-03, 2014-01-26, 2014-05-21, 2014-01-1..."
phone,39681,non-null,object,,"[Android, iPhone, nan]"
signup_date,40000,non-null,object,{DATE},"[2014-01-12, 2014-01-25, 2014-01-02, 2014-01-0..."
surge_pct,40000,non-null,float64,,"[0.0, 100.0, 20.0, 9.1, 2.6, 14.3, 28.6, 11.1,..."
trips_in_first_30_days,40000,non-null,int64,,"[0, 2, 1, 3, 9, 6, 4, 5, 10, 12, 27, 7, 8, 15,..."


None


In [94]:
# Column groups handed to get_basic_features.
ordinal = {
    'luxury_car_user': {False: 0, True: 1},
}
categorical = ['city', 'phone']
date_manip = ['signup_date']
cont = [
    'avg_dist',
    'avg_rating_by_driver',
    'avg_rating_of_driver',
    'avg_surge',
    'surge_pct',
    'trips_in_first_30_days',
    'weekday_pct',
]

# Target label: 1 when the last trip is no more than 30 days before the
# reference date 2014-07-01, otherwise 0.
date_la = datetime.strptime('2014-07-01', '%Y-%m-%d')
df['active'] = df['last_trip_date'].apply(
    lambda trip: int(datetime.strptime(trip, '%Y-%m-%d') >= date_la - timedelta(days=30)))

In [120]:
# get_basic_features returns (pipeline, transformed data); keep only the data.
dd = get_basic_features(df, ordinal, categorical, date_manip, cont)[1]
# Parenthesised form prints the same text on Python 2 and also parses on Python 3.
print(dd.columns)

Index([u'signup_date_dow', u'signup_date_dom', u'luxury_car_user', u'avg_dist',
       u'avg_rating_by_driver', u'avg_rating_of_driver', u'avg_surge',
       u'surge_pct', u'trips_in_first_30_days', u'weekday_pct',
       u'city_Astapor', u'city_King's Landing', u'city_Winterfell',
       u'phone_Android', u'phone_iPhone'],
      dtype='object')


In [122]:
# Score every engineered feature against the 'active' label with the
# chi-squared statistic and keep k=10 of them. fit() returns the selector
# itself, so it can be chained onto the constructor.
select_features = SelectKBest(chi2, k=10).fit(dd, df['active'].values)
select_features.transform(dd)

array([[ 0.        ,  0.06905844,  0.04975392, ...,  0.        ,
         1.        ,  0.        ],
       [ 1.        ,  0.73588559,  0.45650471, ...,  0.        ,
         1.        ,  0.        ],
       [ 1.        ,  0.20976608,  0.03902625, ...,  0.        ,
         0.        ,  1.        ],
       ..., 
       [ 1.        ,  0.08856797,  0.145671  , ...,  0.        ,
         1.        ,  0.        ],
       [ 0.        ,  0.04808052,  0.05923961, ...,  0.        ,
         0.        ,  1.        ],
       [ 1.        ,  0.0972318 ,  0.10752693, ...,  0.        ,
         0.        ,  1.        ]])

In [126]:
# get_support(indices=True) returns integer column positions, while dd's
# columns are labelled by name. Select by position explicitly with .iloc
# instead of relying on dd[<ints>] being interpreted positionally.
dd.iloc[:, select_features.get_support(indices=True)]

Unnamed: 0,luxury_car_user,avg_dist,avg_rating_by_driver,avg_rating_of_driver,surge_pct,weekday_pct,city_Astapor,city_King's Landing,phone_Android,phone_iPhone
0,0.0,0.069058,0.049754,0.049754,0.000000,0.995078,1,0,1,0
1,1.0,0.735886,0.456505,0.456505,0.000000,0.000000,1,0,1,0
2,1.0,0.209766,0.039026,0.048783,0.000000,0.975656,0,0,0,1
3,0.0,0.066646,0.035225,0.035225,0.704504,0.704504,0,0,1,0
4,0.0,0.136073,0.049409,0.049409,0.000000,0.988182,0,0,0,1
5,1.0,0.143239,0.049359,0.049359,0.000000,0.987174,1,0,0,1
6,0.0,0.072504,0.069973,0.059551,0.000000,0.993015,1,0,1,0
7,0.0,0.241851,0.543484,0.679355,0.000000,0.000000,0,0,0,1
8,0.0,0.101812,0.074099,0.069653,0.000000,0.988482,1,0,1,0
9,0.0,0.024050,0.049896,0.029937,0.000000,0.997916,0,0,0,1


In [25]:
def distance(x, y):
    """Euclidean distance between two coordinate sequences.

    Coordinates are paired with ``zip``, so if the sequences differ in length
    the extra trailing coordinates of the longer one are ignored.

    Returns the square root (via ``np.sqrt``) of the summed squared
    differences; two empty sequences give 0.0.
    """
    squared_total = 0
    for u, v in zip(x, y):
        squared_total = squared_total + (u - v) ** 2
    return np.sqrt(squared_total)

def complete_linkage(cluster, point) :
    """Complete-linkage distance between a cluster and a single point.

    Parameters
    ----------
    cluster : iterable of coordinate sequences (the cluster's members).
    point   : coordinate sequence to compare against every member.

    Returns
    -------
    The largest ``distance(member, point)`` over all members of ``cluster``.

    Raises
    ------
    ValueError (from ``np.max``) when ``cluster`` is empty.

    Note: a later cell in this notebook defines another ``complete_linkage``
    with a different signature, which replaces this one once executed.
    """
    # Iterate over the ``cluster`` argument; the previous body referred to an
    # undefined name ``clusters`` and failed with NameError when called.
    return np.max([distance(member, point) for member in cluster])

In [68]:
class Edge:
    """Connection between two nodes, carrying the distance between them.

    Attributes
    ----------
    x1, x2   : the two endpoint objects (each must expose ``.points``).
    distance : ``dist(x1.points, x2.points)``, computed once at construction.
    hash_val : ``hash(self)`` captured at construction; ``point_cluster.addEdge``
               uses it as the dictionary key for this edge.
    """

    def __init__(self, x1, x2, dist=distance):
        self.x1, self.x2 = x1, x2
        self.distance = dist(x1.points, x2.points)
        self.hash_val = hash(self)

    def __str__(self):
        # Rendered as "<x1> <x2> dist = <distance>".
        pieces = [str(self.x1), " ", str(self.x2), " dist = ", str(self.distance)]
        return "".join(pieces)

class point_cluster:
    """A node of the clustering graph.

    Attributes
    ----------
    points    : the value passed as ``x``. ``complete_linkage(x1, x2, ...)``
                treats it as a single point unless ``iscluster`` is True, in
                which case it iterates over it as a collection of points.
    ptno      : identifier of the node; ``str(node)`` renders it.
    iscluster : flag passed as ``cluster`` (False by default).
    edges     : dict of edges attached to this node, keyed by ``edge.hash_val``.
    """

    def __init__(self, x, ptno, cluster = False):
        self.edges = {}
        self.iscluster = cluster
        self.ptno = ptno
        self.points = x

    def addEdge(self, edge) :
        """Attach ``edge`` to this node; an entry with the same ``hash_val`` is overwritten."""
        self.edges.update({edge.hash_val: edge})

    def __str__(self):
        label = str(self.ptno)
        return label

In [65]:
# Toy coordinates used to exercise the clustering helpers.
x = [[0, 0], [0, 1], [1, 1]]

# One point_cluster per coordinate, numbered by its position in x.
# enumerate replaces the hand-maintained counter.
points = [point_cluster(coord, i) for i, coord in enumerate(x)]

In [69]:
# Create an Edge for every ordered pair of distinct points and register it on
# both endpoints. Each unordered pair therefore yields two Edge objects, one
# per direction.
edges = []
for p1 in points:
    for p2 in points:
        if p1 == p2:
            # Never connect a point to itself.
            continue
        edge = Edge(p1, p2)
        p1.addEdge(edge)
        p2.addEdge(edge)
        edges.append(edge)

In [70]:
# Render every edge as text, in creation order.
list(map(str, edges))

['0 1 dist = 1.0',
 '0 2 dist = 1.41421356237',
 '1 0 dist = 1.0',
 '1 2 dist = 1.0',
 '2 0 dist = 1.41421356237',
 '2 1 dist = 1.0']

In [76]:
# Order the edges by ascending distance; sorted() is stable, so edges with
# equal distance keep their creation order.
edges = sorted(edges, key=lambda edge: edge.distance)
list(map(str, edges))

['0 1 dist = 1.0',
 '1 0 dist = 1.0',
 '1 2 dist = 1.0',
 '2 1 dist = 1.0',
 '0 2 dist = 1.41421356237',
 '2 0 dist = 1.41421356237']

In [84]:
def complete_linkage(x1, x2, distance=distance) :
    """Complete-linkage distance between two nodes.

    Each node exposes ``.points`` and ``.iscluster``. When ``iscluster == True``
    the node's ``.points`` is iterated as a collection of points; otherwise
    ``.points`` is treated as one single point.

    Returns the largest ``distance(p1, p2)`` over all pairs with ``p1`` from
    ``x1`` and ``p2`` from ``x2``. The running maximum starts at 0, so 0 is
    returned when there are no pairs or no pair distance exceeds 0.

    Note: this definition shares its name with the earlier two-argument
    ``complete_linkage(cluster, point)`` and replaces it once this cell runs.
    """
    def members(node):
        # Normalise a node to a list-like of points.
        if node.iscluster == True:
            return node.points
        return [node.points]

    left_points = members(x1)
    right_points = members(x2)

    farthest = 0
    for left in left_points:
        for right in right_points:
            separation = distance(left, right)
            if farthest < separation:
                farthest = separation
    return farthest

In [85]:
# Sanity check on the first edge in `edges`: both endpoints were created with
# the default cluster=False, so each is treated as a single point and the
# result is simply the distance between them.
complete_linkage(edges[0].x1, edges[0].x2)

1.0