In [None]:
# Business case: this portfolio has two goals for the Trivalley store: find the average spend of a customer, and predict whether a customer is going to be a buyer or not.
# Situations like this, where two dependent (target) variables have to be predicted, are common.
# To work on this data, I'm following the steps below:
# Data collection.
# Performing EDA with both target variables included in the dataset.
# Performing EDA with only one target variable at a time.
# Building visuals to understand the trends of the store.
# Data preparation ==> dropping noise variables, dealing with missing values and outliers.
# Statistical analysis.
# Creating and comparing models and picking the better model.
# Testing the model with various methods and finalizing it.

In [None]:
# Raw-GitHub address of the Trivalley customer spend / buyer-flag CSV.
url = (
    "https://raw.githubusercontent.com/lokidiffender/"
    "Trivalley-Bike-Store-Analysis-Prediction-Model/main/"
    "Trivalley%20Customer%20Spend%20and%20Buyer%20Flag%20Data.csv"
)
# Echo the address as the cell output.
url

In [None]:
# pandas supplies the DataFrame type and the CSV reader used below.
import pandas as pd

# Download the CSV that `url` points to and keep it as the working frame `data`.
data = pd.read_csv(filepath_or_buffer=url)

In [None]:
# Display the loaded frame as the cell output (pandas truncates long frames when rendering).
data


In [None]:
# Check the size of the data as a (rows, columns) tuple.
data.shape


In [None]:
# Summarise the columns: names, dtypes and non-null counts.
data.info()


In [None]:
# Display descriptive statistics of the dataset
# (describe() covers only the numeric columns by default).
data.describe()

In [None]:
# Installing dataprep package for data profiling
!pip install dataprep

In [None]:
# Importing the dataprep package
from dataprep.eda import create_report

In [None]:
# Build the dataprep profiling report for the full frame and write it to an
# HTML file (relative path, so it lands in the current working directory).
create_report(data).save("Trivalley_DataPrep_EDA_report.html")

In [None]:
# List the column names available in the raw data.
data.columns

In [None]:
!pip install sketch

In [None]:
!pip install rich

In [None]:
from rich import print
import os

# Configure sketch through its environment variable before the package is imported.
# NOTE(review): 26 presumably matches the frame's column count once Age is added — confirm.
os.environ.update({"SKETCH_MAX_COLUMNS": "26"})

In [None]:
# Importing sketch is what makes `data.sketch` usable below
# (presumably via a pandas accessor registered on import — confirm).
import sketch

# Ask sketch's assistant for a code suggestion on deriving age from BirthDate.
# This is exploratory only: the cell's return value is not assigned or reused.
data.sketch.howto("calculate the age based on current time zone with the birthdate.")

In [None]:
# Deriving the age from the customer's birth date, as the raw birth date itself doesn't help in predicting.
# Import necessary libraries
import pandas as pd
from datetime import datetime

# Convert BirthDate column to datetime format
data['BirthDate'] = pd.to_datetime(data['BirthDate'])

# Age = whole years elapsed between BirthDate and now, stored as float.
# Casting a timedelta Series to '<m8[Y]' raises TypeError on pandas >= 2.0
# (only s/ms/us/ns resolutions are supported), so floor-divide the elapsed
# time by the mean Gregorian year (365.2425 days) instead.
# Rows with a missing BirthDate end up as NaN.
elapsed_since_birth = datetime.now() - data['BirthDate']
data['Age'] = (elapsed_since_birth // pd.Timedelta(days=365.2425)).astype(float)

# Show the first rows to check the new Age column (rich display instead of print)
data.head()


In [None]:
# Re-list the columns to confirm that 'Age' is now part of the frame.
data.columns

In [None]:
# Split the data into two datasets, each holding exactly one target variable
# plus the same customer attributes. Noise columns are excluded later, when
# the modelling experiment is configured.
customer_attribute_cols = [
    'Title', 'FirstName', 'MiddleName', 'LastName', 'Suffix',
    'AddressLine1', 'AddressLine2', 'City', 'StateProvinceName',
    'CountryRegionName', 'PostalCode', 'PhoneNumber', 'BirthDate',
    'Education', 'Occupation', 'Gender', 'MaritalStatus', 'HomeOwnerFlag',
    'NumberCarsOwned', 'NumberChildrenAtHome', 'TotalChildren',
    'YearlyIncome', 'Age',
]

# Regression dataset: target is AveMonthSpend.
data_avgspend = data[['CustomerID', 'AveMonthSpend'] + customer_attribute_cols]

# Classification dataset: target is BikeBuyer.
# Select from `data`; a bare [[...]] literal would only build a nested list
# of column names, not a DataFrame.
data_buyer_ornot = data[['CustomerID', 'BikeBuyer'] + customer_attribute_cols]

# **Case-1: Regression with Average Monthly spend**

In [None]:
#Using data_avgspend=data
# Installing sweetviz package for EDA
!pip install sweetviz

In [None]:
# Importing the sv package
import sweetviz as sv

In [None]:
# Generating EDA report using sv package
# The input is data_avgspend, i.e. the frame that carries AveMonthSpend but not BikeBuyer.
sv_report = sv.analyze(data_avgspend)

# Converting the report to html format
# The file name is relative, so the report is written to the current working directory.
sv_report.show_html("Sweetviz_Report_Of_Trivalley_Sales.html")

In [None]:
!pip install -U --pre pycaret

In [None]:
import numpy as np
import pandas as pd

In [None]:
# Pairwise correlations between the numeric columns of the regression dataset.
# The frame also holds text and datetime columns; on pandas >= 2.0 a plain
# .corr() raises on those, so restrict the input to numeric/bool columns first
# (this also works on older pandas versions).
data_avgspend.select_dtypes(include=["number", "bool"]).corr()

In [None]:
# Column dtypes and non-null counts of the regression dataset.
data_avgspend.info()

In [None]:
# Column names of the regression dataset, used to build the feature lists for modelling.
data_avgspend.columns

In [None]:
# Name of the target column for the regression experiment.
y = "AveMonthSpend"

# Columns excluded from modelling: identifiers, name parts, address lines,
# phone number and the raw birth date (noise/redundant variables and
# variables with more than 20% missing data).
ignored_cols = [
    'CustomerID',
    'Title',
    'FirstName',
    'MiddleName',
    'LastName',
    'Suffix',
    'AddressLine1',
    'AddressLine2',
    'PostalCode',
    'PhoneNumber',
    'BirthDate',
]

# Features to be treated as categorical.
categorical_cols = [
    'City',
    'StateProvinceName',
    'CountryRegionName',
    'Education',
    'Occupation',
    'Gender',
    'MaritalStatus',
]

# Features to be treated as numeric.
numerical_cols = [
    'HomeOwnerFlag',
    'NumberCarsOwned',
    'NumberChildrenAtHome',
    'TotalChildren',
    'YearlyIncome',
    'Age',
]


In [None]:
from pycaret.regression import *

In [None]:
# Setting/configuring the pycaret ML experiment on the average-spend dataset.
# The target name and the categorical / numeric / ignored column lists come
# from the configuration cell above.
regression_setup = setup(
    data=data_avgspend,
    target=y,
    categorical_features=categorical_cols,
    numeric_features=numerical_cols,
    ignore_features=ignored_cols,
    # Fixed seed so the train/test split and the model comparison are
    # repeatable across kernel restarts.
    session_id=42,
)

In [None]:
# Train and rank the candidate regressors for the configured experiment;
# the comparison table is the cell output (the return value is not kept).
compare_models()

In [None]:
# Building the model using the best algorithm from the compare_models() function.
# NOTE(review): 'lightgbm' is hard-coded here rather than taken from the
# compare_models() result — re-check this choice whenever the ranking changes.
model=create_model('lightgbm')


In [None]:
# Score the full customer frame with the fitted model to get the predicted
# average spend for every row.
predicted_avg_spendings = predict_model(model, data=data)

In [None]:
# Save the scored frame as a CSV file in the current working directory.
predicted_avg_spendings.to_csv(
    path_or_buf="Trivalley_Avg_Spend_Predicted_Scores.csv"
)

In [None]:
# Predicting average spend for a client file: load it and derive Age the same
# way as for the training data, so the model sees the same feature.
client_data = pd.read_csv('https://raw.githubusercontent.com/nvamsimohan/DallasDSA/main/Trivalley%20Client%20data%20file.csv')
client_data.info()

# Convert BirthDate column to datetime format
client_data['BirthDate'] = pd.to_datetime(client_data['BirthDate'])

# Age = whole years elapsed between BirthDate and now, stored as float.
# Casting a timedelta Series to '<m8[Y]' raises TypeError on pandas >= 2.0,
# so floor-divide by the mean Gregorian year (365.2425 days) instead.
# pd.Timestamp.now() keeps this cell independent of the `datetime` import
# made in an earlier cell.
elapsed_since_birth = pd.Timestamp.now() - client_data['BirthDate']
client_data['Age'] = (elapsed_since_birth // pd.Timedelta(days=365.2425)).astype(float)

# Show the first rows to check the new Age column (rich display instead of print)
client_data.head()

In [None]:
# Score the client file with the same fitted model, then export the scored
# frame as a CSV file in the current working directory.
client_predictions = predict_model(model, data=client_data)
client_predictions.to_csv(
    path_or_buf="Trivalley_Client_Average_spend_predicted_results.csv"
)