### Problem statement
1. About the data- Rental bike usuage data
2. We want to predict the churn (customers leaving the product)
<hr>

In [1]:
import pandas as pd

In [4]:
data = pd.read_csv('https://raw.githubusercontent.com/edyoda/data-science-complete-tutorial/master/Data/churn.csv.txt', parse_dates=['signup_date', 'last_trip_date'])

In [6]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50000 entries, 0 to 49999
Data columns (total 12 columns):
avg_dist                  50000 non-null float64
avg_rating_by_driver      49799 non-null float64
avg_rating_of_driver      41878 non-null float64
avg_surge                 50000 non-null float64
city                      50000 non-null object
last_trip_date            50000 non-null datetime64[ns]
phone                     49604 non-null object
signup_date               50000 non-null datetime64[ns]
surge_pct                 50000 non-null float64
trips_in_first_30_days    50000 non-null int64
luxury_car_user           50000 non-null bool
weekday_pct               50000 non-null float64
dtypes: bool(1), datetime64[ns](2), float64(6), int64(1), object(2)
memory usage: 4.2+ MB


### Is there any chrun column in the data

1. Many time the target data is directly not available
2. This can be derived from the feature column
3. From the data, we need to identify the date on which the data is dowloaded

In [8]:
data.last_trip_date.min()

Timestamp('2014-01-01 00:00:00')

In [7]:
data.last_trip_date.max()

Timestamp('2014-07-01 00:00:00')

In [10]:
import datetime

In [13]:
cutooff_data = data.last_trip_date.max()- datetime.timedelta(30)

In [14]:
cutooff_data

Timestamp('2014-06-01 00:00:00')

In [15]:
data['churn'] = data.last_trip_date.map(lambda d: "Not churn"  if d>cutooff_data else 'churn' )

In [16]:
data.sample(10)

Unnamed: 0,avg_dist,avg_rating_by_driver,avg_rating_of_driver,avg_surge,city,last_trip_date,phone,signup_date,surge_pct,trips_in_first_30_days,luxury_car_user,weekday_pct,churn
36527,8.75,4.0,5.0,1.0,Astapor,2014-05-28,iPhone,2014-01-19,0.0,0,True,100.0,churn
29032,1.91,5.0,5.0,1.0,Astapor,2014-01-31,Android,2014-01-25,0.0,1,True,100.0,churn
6072,4.12,4.8,4.9,1.07,Winterfell,2014-06-07,iPhone,2014-01-03,7.9,10,True,44.7,Not churn
38097,8.21,4.6,5.0,1.07,Winterfell,2014-06-13,iPhone,2014-01-11,10.0,2,True,60.0,Not churn
12115,2.53,5.0,,1.0,Winterfell,2014-02-02,iPhone,2014-01-25,0.0,1,True,0.0,churn
31818,3.06,4.4,4.5,1.16,King's Landing,2014-06-04,Android,2014-01-31,25.0,1,False,75.0,Not churn
46780,13.1,5.0,,1.0,King's Landing,2014-06-01,Android,2014-01-06,0.0,0,False,0.0,churn
29121,4.71,4.5,4.5,1.0,Winterfell,2014-05-30,Android,2014-01-31,0.0,3,False,100.0,churn
18492,2.3,5.0,5.0,1.25,Astapor,2014-06-01,Android,2014-01-27,33.3,0,False,0.0,churn
34938,8.63,5.0,4.0,1.0,Winterfell,2014-05-26,iPhone,2014-01-02,0.0,1,False,66.7,churn


### Handling the missing values and pipeline systems

In [17]:
float_churn_data = data.select_dtypes(include = ['float64','bool'])

In [18]:
cat_churn_data = data.select_dtypes(include=['O'])

In [30]:
int_churn_data = data[['phone','city']]

In [22]:
from sklearn.pipeline import make_pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import MinMaxScaler

In [46]:
float_pipeline = make_pipeline(SimpleImputer(strategy= 'Median'),MinMaxScaler())

In [27]:
from sklearn.preprocessing import OrdinalEncoder

In [29]:
cat_churn_data.inf

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50000 entries, 0 to 49999
Data columns (total 3 columns):
city     50000 non-null object
phone    49604 non-null object
churn    50000 non-null object
dtypes: object(3)
memory usage: 1.1+ MB


In [32]:
cat_pipeline = make_pipeline(SimpleImputer(strategy = 'most_frequent'), OrdinalEncoder())

In [33]:
int_pipline = make_pipeline(MinMaxScaler())

In [38]:
data['subscription_dates']= data.signup_date- data.last_trip_date

In [39]:
data['subscription_dates'] = data['subscription_dates'].dt.days

In [40]:
int_churn_data = data.select_dtypes(include = ['int64'])

In [42]:
int_churn_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50000 entries, 0 to 49999
Data columns (total 2 columns):
trips_in_first_30_days    50000 non-null int64
subscription_dates        50000 non-null int64
dtypes: int64(2)
memory usage: 781.4 KB


In [44]:
from sklearn.compose import make_column_transformer

In [52]:
preprocessor = make_column_transformer(
    (int_pipline, int_churn_data.columns),
    (cat_pipeline, cat_churn_data.columns),
    (float_pipeline,float_churn_data.columns)
    
)

In [49]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.naive_bayes import GaussianNB

In [54]:
pipline = make_pipeline(preprocessor, RandomForestRegressor())

In [50]:
help(make_pipeline)

Help on function make_pipeline in module sklearn.pipeline:

make_pipeline(*steps, **kwargs)
    Construct a Pipeline from the given estimators.
    
    This is a shorthand for the Pipeline constructor; it does not require, and
    does not permit, naming the estimators. Instead, their names will be set
    to the lowercase of their types automatically.
    
    Parameters
    ----------
    *steps : list of estimators.
    
    memory : None, str or object with the joblib.Memory interface, optional
        Used to cache the fitted transformers of the pipeline. By default,
        no caching is performed. If a string is given, it is the path to
        the caching directory. Enabling caching triggers a clone of
        the transformers before fitting. Therefore, the transformer
        instance given to the pipeline cannot be inspected
        directly. Use the attribute ``named_steps`` or ``steps`` to
        inspect estimators within the pipeline. Caching the
        transformers is 

In [55]:
from sklearn.model_selection import train_test_split

In [56]:
data

Unnamed: 0,avg_dist,avg_rating_by_driver,avg_rating_of_driver,avg_surge,city,last_trip_date,phone,signup_date,surge_pct,trips_in_first_30_days,luxury_car_user,weekday_pct,churn,subscription_dates
0,3.67,5.0,4.7,1.10,King's Landing,2014-06-17,iPhone,2014-01-25,15.4,4,True,46.2,Not churn,-143
1,8.26,5.0,5.0,1.00,Astapor,2014-05-05,Android,2014-01-29,0.0,0,False,50.0,churn,-96
2,0.77,5.0,4.3,1.00,Astapor,2014-01-07,iPhone,2014-01-06,0.0,3,False,100.0,churn,-1
3,2.36,4.9,4.6,1.14,King's Landing,2014-06-29,iPhone,2014-01-10,20.0,9,True,80.0,Not churn,-170
4,3.13,4.9,4.4,1.19,Winterfell,2014-03-15,Android,2014-01-27,11.8,14,False,82.4,churn,-47
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
49995,5.63,4.2,5.0,1.00,King's Landing,2014-06-05,iPhone,2014-01-25,0.0,0,False,100.0,Not churn,-131
49996,0.00,4.0,,1.00,Astapor,2014-01-25,iPhone,2014-01-24,0.0,1,False,0.0,churn,-1
49997,3.86,5.0,5.0,1.00,Winterfell,2014-05-22,Android,2014-01-31,0.0,0,True,100.0,churn,-111
49998,4.58,3.5,3.0,1.00,Astapor,2014-01-15,iPhone,2014-01-14,0.0,2,False,100.0,churn,-1


In [59]:
trainX, testX, trainY, testY = train_test_split(data.drop(columns =['churn']), data.churn)

In [62]:
data['churn'] = data.last_trip_date.map(lambda d: "Not churn"  if d>cutooff_data else 'churn' )

In [63]:
pipline.fit(trainX, trainY)

ValueError: 'churn' is not in list