<a href="https://colab.research.google.com/github/lolatelo/implementations/blob/main/lightgbm_social_network_ads.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Loading data
This notebook follows along with the tutorial at https://medium.com/@vaishnaviy502/light-gbm-c11e06b53479


In [None]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

In [None]:
# Colab / Google dependencies needed to read the spreadsheet
from google.colab import auth
from google.auth import default
import gspread

# Sign in to Google, then build a gspread client from the default credentials
auth.authenticate_user()
creds, _ = default()
gc = gspread.authorize(creds)

# URL of the Google Sheet to read
spreadsheet_url = 'https://docs.google.com/spreadsheets/d/1FpAdPPUDPVrKkVC0nr0c53e2QQVQuHE3erjyM1kk3GA/edit#gid=2134510344'

# Fetch all values from the first worksheet
worksheet = gc.open_by_url(spreadsheet_url).sheet1
rows = worksheet.get_all_values()

# Wrap the raw rows in a DataFrame (no column names are assigned yet) and print it
data = pd.DataFrame.from_records(rows)
print(data)

In [None]:
# The first sheet row carries the column names; every later row is a record
headers, data_rows = rows[0], rows[1:]

# Rebuild the DataFrame, this time labelling the columns with the header row
data = pd.DataFrame(data=data_rows, columns=headers)

# Preview the first few rows as a sanity check
print(data.head())

    User ID  Gender Age EstimatedSalary Purchased
0  15624510    Male  19           19000         0
1  15810944    Male  35           20000         0
2  15668575  Female  26           43000         0
3  15603246  Female  27           57000         0
4  15804002    Male  19           76000         0


# Prepping data

In [None]:
# Refer to the loaded sheet as `dataset` from here on (same object as `data`, not a copy)
dataset = data

In [None]:
# Preview the first rows of the working DataFrame
dataset.head()

Unnamed: 0,User ID,Gender,Age,EstimatedSalary,Purchased
0,15624510,Male,19,19000,0
1,15810944,Male,35,20000,0
2,15668575,Female,26,43000,0
3,15603246,Female,27,57000,0
4,15804002,Male,19,76000,0


In [None]:
# Create the feature matrix and the target vector.
# Positional columns 2 and 3 are Age and EstimatedSalary; column 4 is Purchased.
# The sheet is read as text, so every cell is a string; cast explicitly to numeric
# types instead of relying on downstream libraries to coerce object arrays.
X = dataset.iloc[:, [2, 3]].astype(float).values
y = dataset.iloc[:, 4].astype(int).values

In [None]:
# Inspect the first five feature rows
X[:5]

array([['19', '19000'],
       ['35', '20000'],
       ['26', '43000'],
       ['27', '57000'],
       ['19', '76000']], dtype=object)

In [None]:
# Inspect the first five target values
y[:5]

array(['0', '0', '0', '0', '0'], dtype=object)

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 25% of the rows for testing; the fixed random_state makes the split repeatable.
# No feature scaling is applied: tree-based learners are largely insensitive to the
# scale of the training data, so scaling is usually optional.
x_train, x_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=0,
)

In [None]:
# Inspect the first five training feature rows
x_train[:5]

array([['44', '39000'],
       ['32', '120000'],
       ['38', '50000'],
       ['32', '135000'],
       ['52', '21000']], dtype=object)

# Model building and training
- We need to convert our training data into the LightGBM `Dataset` format and define the training settings in a **Python dictionary of parameter names and their values**
- And then...train!

In [None]:
import lightgbm as lgb

# Wrap the training split in LightGBM's own Dataset container
d_train = lgb.Dataset(x_train, label=y_train)

# Training configuration, declared in one place
params = {
    'learning_rate': 0.003,
    'boosting_type': 'gbdt',
    'objective': 'binary',        # the loss function to be minimized during training
    'metric': 'binary_logloss',   # used to evaluate model performance; does not influence the model's learning process
    'sub_feature': 0.5,
    'num_leaves': 10,
    'min_data': 50,
    'max_depth': 10,
}

# Train for 1000 boosting rounds
clf = lgb.train(params, d_train, num_boost_round=1000)

In [None]:
# Show the parameter dictionary that was used for training
params

{'learning_rate': 0.003,
 'boosting_type': 'gbdt',
 'objective': 'binary',
 'metric': 'binary_logloss',
 'sub_feature': 0.5,
 'num_leaves': 10,
 'min_data': 50,
 'max_depth': 10}

In [None]:
# Model prediction: the model outputs one probability per test row
y_prob = clf.predict(x_test)

# Convert probabilities into binary predictions with a threshold of 0.5.
# The comparison is vectorised so it covers the whole test set whatever its size;
# a hard-coded range(0, 100) would fail or leave raw probabilities behind if the
# test set did not contain exactly 100 rows.
# The result stays a float array of 0.0 / 1.0 values, as the element-wise loop produced.
y_pred = np.where(y_prob >= 0.5, 1.0, 0.0)

In [None]:
# Bring predictions and true labels to the same integer dtype.
# The confusion matrix needs both inputs to share a type; otherwise it raises
# "Classification metrics can't handle a mix of binary and continuous targets."
y_pred = y_pred.astype(dtype=int)
y_test = y_test.astype(dtype=int)

# Results

In [None]:
from sklearn.metrics import confusion_matrix

# Cross-tabulate the true test labels against the predicted labels
cm = confusion_matrix(y_true=y_test, y_pred=y_pred)

In [None]:
# Display the confusion matrix computed from y_test and y_pred
cm

array([[64,  4],
       [ 5, 27]])

In [None]:
from sklearn.metrics import accuracy_score

# Fraction of test rows that were classified correctly.
# Arguments follow the (y_true, y_pred) convention, matching the confusion_matrix call;
# accuracy is symmetric, so the value is the same as with the arguments swapped.
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred, normalize=True)
accuracy

0.91

Even though the dataset is small, the model reaches 91% accuracy on the held-out test set, which suggests it is not badly overfitted. This is largely thanks to well-chosen model parameters.

**Parameter tuning** is one of the most important ways to improve your model.