In [2]:
## Importing all the required libraries

import os
import pickle
from urllib.parse import quote_plus

import numpy as np
import pandas as pd
import pymysql
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sqlalchemy import create_engine

In [2]:
# installing pyarrow to read parquet input files
#pip install pyarrow

Collecting pyarrow
  Using cached https://files.pythonhosted.org/packages/b5/84/ac0c239ffc4cde7c3aa9840ce734b42d4e9100e76927c6ed0100f00de10a/pyarrow-4.0.1-cp37-cp37m-win_amd64.whl
Installing collected packages: pyarrow
Successfully installed pyarrow-4.0.1
Note: you may need to restart the kernel to use updated packages.


# Reading input files

In [4]:
# Load the leads table from its parquet file.
leads_path = 'ds_leads.parquet.gzip'
leads = pd.read_parquet(leads_path)

In [9]:
# Load the offers table from its parquet file.
offers_path = 'ds_offers.parquet.gzip'
offers = pd.read_parquet(offers_path)

Unnamed: 0,lead_uuid,offer_id,apr,lender_id
33,ae2d5046-a7c7-44fe-b6f4-cde3d8bf29e2,810117850,199.0,1103
35,b12fbb06-1402-4de3-a91f-fb6360ff85e4,810119030,249.0,1103
38,a119a3db-ab14-46fc-acd1-35cf20dec1ec,810122970,249.0,1103
40,3166d6bd-1c79-44c0-867c-889afd35990c,810124218,17.69,240
41,3166d6bd-1c79-44c0-867c-889afd35990c,810124220,17.19,240


In [10]:
# Load the clicks table from its parquet file.
clicks_path = 'ds_clicks.parquet.gzip'
clicks = pd.read_parquet(clicks_path)

Unnamed: 0,offer_id,clicked_at
0,810116813,2021-03-23 02:01:48.339
1,810118339,2021-03-23 02:01:14.135
2,810132429,2021-03-23 02:46:49.753
3,810152009,2021-03-23 04:46:19.662
4,810177207,2021-03-23 08:44:04.494


# Writing to mysql db

In [2]:
# Build the SQLAlchemy engine used to talk to the local MySQL database.
# Connection settings are read from environment variables so that no
# credentials are stored in the notebook. User and database fall back to the
# values this notebook has always used; the password has no default.
mysql_user = os.environ.get("MYSQL_USER", "root")
mysql_password = os.environ["MYSQL_PASSWORD"]  # raises KeyError when unset
mysql_db = os.environ.get("MYSQL_DB", "even")

# quote_plus escapes characters such as '@', ':' or '/' that would otherwise
# corrupt the connection URL.
engine = create_engine(
    "mysql+pymysql://{user}:{pw}@localhost:3306/{db}".format(
        user=quote_plus(mysql_user),
        pw=quote_plus(mysql_password),
        db=mysql_db,
    )
)

In [28]:
# Write the leads frame to the `leads` table. 'replace' recreates the table on
# every run, so re-executing this cell does not append a second copy of each
# row (duplicated leads would multiply rows in the later join on lead_uuid).
leads.to_sql(con=engine, name='leads', if_exists='replace', index=False)

In [29]:
# Write the offers frame to the `offers` table. 'replace' recreates the table
# on every run, so re-executing this cell does not append a second copy of
# each offer.
offers.to_sql(con=engine, name='offers', if_exists='replace', index=False)

In [30]:
# Write the clicks frame to the `clicks` table. 'replace' recreates the table
# on every run, so re-executing this cell does not append a second copy of
# each click.
clicks.to_sql(con=engine, name='clicks', if_exists='replace', index=False)

## Reading from mysql db

In [10]:
# Read the leads table back from MySQL. The engine itself is passed, rather
# than a connection opened with engine.connect(), so pandas opens and closes
# its own connection and none is left open after the query.
leads = pd.read_sql("select * from leads", engine)

In [11]:
# Read the offers table back from MySQL. The engine itself is passed, rather
# than a connection opened with engine.connect(), so pandas opens and closes
# its own connection and none is left open after the query.
offers = pd.read_sql("select * from offers", engine)

In [12]:
# Read the clicks table back from MySQL. The engine itself is passed, rather
# than a connection opened with engine.connect(), so pandas opens and closes
# its own connection and none is left open after the query.
clicks = pd.read_sql("select * from clicks", engine)

In [13]:
# Attach the lead attributes to every offer. This is a left join on
# lead_uuid, so offers without a matching lead are kept with missing values.
offers_leads = offers.merge(leads, on='lead_uuid', how='left')

In [14]:
# Count the rows that contain at least one missing value.
rows_with_missing = offers_leads.isna().any(axis='columns')
rows_with_missing.sum()

578

In [15]:
# Discard every row that has a missing value. These rows were judged to be
# less than 1% of the data, so they are removed rather than imputed.
offers_leads = offers_leads.dropna()

In [16]:
# Left-join the click log onto the offers to find out which offers were
# clicked; offers without a click keep NaN in offer_id_2 / clicked_at.
# The click log is first reduced to one row per offer: an offer that was
# clicked more than once would otherwise be repeated in the result and be
# over-represented in the training data.
clicks_per_offer = (
    clicks
    .drop_duplicates(subset='offer_id')
    .rename(columns={'offer_id': 'offer_id_2'})
)
joined_df = pd.merge(
    offers_leads,
    clicks_per_offer,
    how='left',
    left_on='offer_id',
    right_on='offer_id_2',
)

In [17]:
# "limited" and "unknown" credit are taken to mean that there is not enough
# information to derive a credit score. The `nocredit` flag marks those leads
# with 1 and every other lead with 0.
no_credit_labels = ['limited', 'unknown']
joined_df['nocredit'] = joined_df['credit'].isin(no_credit_labels).astype('int64')

In [18]:
# Encode credit as an ordinal number, from weakest to strongest rating.
# 'limited' and 'unknown' share bucket 0. 'fair' ranks below 'good', so the
# buckets increase monotonically with credit quality.
# Any label that is not listed here maps to NaN.
credit_rank = {
    'limited': 0,
    'unknown': 0,
    'poor': 1,
    'fair': 2,
    'good': 3,
    'excellent': 4,
}
joined_df['credit_bucket'] = joined_df['credit'].map(credit_rank)

In [19]:
# Fit a one-hot encoder on loan_purpose, which is a nominal column.
# Only fit() is called: the encoded matrix is not used in this cell, so
# computing it with fit_transform() would be wasted work.
enc = OneHotEncoder()
enc.fit(joined_df[['loan_purpose']])

# Persist the fitted encoder to the file "encoder" (presumably so the same
# category mapping can be reused outside this notebook).
with open("encoder", "wb") as encoder_file:
    pickle.dump(enc, encoder_file)

In [20]:
# Append the one-hot encoded loan_purpose columns to the frame.
# - transform() reuses the categories the encoder was already fitted on, so
#   the columns match the encoder that was pickled instead of refitting it.
# - The new columns get string names. Default integer labels (0..k-1) mixed
#   with the existing string column names are rejected by newer scikit-learn
#   versions when the frame is passed to an estimator.
# - index=joined_df.index keeps the encoded rows aligned with the frame.
loan_purpose_onehot = pd.DataFrame(
    enc.transform(joined_df[['loan_purpose']]).toarray(),
    columns=['loan_purpose_' + str(category) for category in enc.categories_[0]],
    index=joined_df.index,
)
joined_df = pd.concat([joined_df, loan_purpose_onehot], axis=1)

In [21]:
# Binary target: 1 when the offer matched a row of the click log, else 0.
# After the left join, offer_id_2 is NaN exactly for offers without a click,
# so a null check gives the label directly. This yields integer labels and
# does not depend on any particular offer id value (such as 0) being unused.
joined_df['output'] = joined_df['offer_id_2'].notna().astype('int64')

In [22]:
# Remove the columns that are not used for model training and testing:
# identifiers, the click bookkeeping columns, and the raw categorical columns.
columns_to_drop = [
    'lead_uuid',
    'offer_id',
    'lender_id',
    'clicked_at',
    'offer_id_2',
    'loan_purpose',
    'credit',
]
joined_df = joined_df.drop(columns=columns_to_drop)

In [23]:
# Separate the target column from the feature columns.
target_column = 'output'
y = joined_df[target_column]
X = joined_df.drop(columns=[target_column])

In [24]:
# Hold out 20% of the rows as the test set and train on the remaining 80%.
# The split is stratified on the target and uses a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.20,
    stratify=y,
    random_state=42,
)

In [25]:
# Fit a logistic regression classifier on the training split.
# fit() returns the estimator itself, which is what `clf` is bound to.
logistic_model = LogisticRegression(random_state=0)
clf = logistic_model.fit(X_train, y_train)

In [None]:
# Serialize the fitted classifier and store it in the file "train_model".
with open("train_model", "wb") as model_file:
    model_file.write(pickle.dumps(clf))