# Projeto Final - Machine Learning:
#### Integrantes: 
- Natália Queiroz Menezes Carreras
- Willian Kenzo Asanuma Lee
#### Database: https://www.kaggle.com/datasets/mehmettahiraslan/customer-shopping-dataset

#### Bibliotecas necessárias:

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import pathlib
import pickle
import numpy as np
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import LinearRegression

#### Importando o Dataset:

In [2]:
# Locate the CSV in the current working directory, load it into a DataFrame
# and print the per-column non-null counts and dtypes.
filename = pathlib.Path.cwd().joinpath('customer_shopping_data.csv')
data = pd.read_csv(filepath_or_buffer=filename)
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 99457 entries, 0 to 99456
Data columns (total 10 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   invoice_no      99457 non-null  object 
 1   customer_id     99457 non-null  object 
 2   gender          99457 non-null  object 
 3   age             99457 non-null  int64  
 4   category        99457 non-null  object 
 5   quantity        99457 non-null  int64  
 6   price           99457 non-null  float64
 7   payment_method  99457 non-null  object 
 8   invoice_date    99457 non-null  object 
 9   shopping_mall   99457 non-null  object 
dtypes: float64(1), int64(2), object(7)
memory usage: 7.6+ MB


#### Analisando o dataset:

##### Category:

In [None]:
# Store the product category column with the pandas 'category' dtype, then
# display how many rows belong to each category.
category_as_categorical = data['category'].astype('category')
data['category'] = category_as_categorical
data['category'].value_counts()

In [None]:
# Pie chart with the share of rows per product category.
#
# The counts are computed once, and the wedge labels are taken from the
# counts' own index, so each label is always paired with its own count
# (no hand-maintained label list, no integer keys on a label-based index).
category_counts = data['category'].value_counts()
labels = category_counts.index.tolist()
sizes = category_counts.tolist()
colors=['#fbf8cc','#fde4cf','#ffcfd2','#f1c0e8','#cfbaf0','#a3c4f3','#90dbf4','#8eecf5','#98f5e1','#b9fbc0']
fig1, ax1 = plt.subplots()
ax1.pie(sizes, labels=labels, autopct='%1.2f%%',
        startangle=90, colors=colors)
ax1.axis('equal')  # equal axis scaling so the pie is drawn as a circle
plt.show()

##### Gender:

In [None]:
# Store the gender column with the pandas 'category' dtype, then display the
# number of rows per gender.
gender_as_categorical = data['gender'].astype('category')
data['gender'] = gender_as_categorical
data['gender'].value_counts()

In [None]:
# Bar chart with the number of rows per gender.
#
# Bar labels come from the index of the counts themselves, so a label can
# never be attached to the wrong bar, and the counts are computed only once.
gender_counts = data['gender'].value_counts()
titles = gender_counts.index.tolist()
qnt = gender_counts.tolist()

fig = plt.figure()
ax = fig.add_axes([0,0,0.5,1])
ax.bar(titles,qnt, color=['#F4796B','#A9DEF9'], width=0.5)
ax.set_facecolor("black")
plt.grid(axis = 'y')
plt.title('Genders')
plt.show()


##### Payment method:

In [None]:
# Store the payment method column with the pandas 'category' dtype, then
# display the number of rows per payment method.
payment_as_categorical = data['payment_method'].astype('category')
data['payment_method'] = payment_as_categorical
data['payment_method'].value_counts()

In [None]:
# Pie chart with the share of rows per payment method.
#
# Labels are read from the index of the counts, which keeps every wedge label
# aligned with its count and avoids integer keys on a label-based index.
payment_counts = data['payment_method'].value_counts()
labels = payment_counts.index.tolist()
sizes = payment_counts.tolist()
colors = ['#F4796B','#A9DEF9','#DDFCAD']
fig1, ax1 = plt.subplots()
ax1.pie(sizes, labels=labels, autopct='%1.1f%%',
        startangle=90, colors=colors)
ax1.axis('equal')  # equal axis scaling so the pie is drawn as a circle
plt.show()

##### Shopping mall:

In [None]:
# Store the shopping mall column with the pandas 'category' dtype, then
# display the number of rows per mall.
mall_as_categorical = data['shopping_mall'].astype('category')
data['shopping_mall'] = mall_as_categorical
data['shopping_mall'].value_counts()

In [None]:
# Bar chart with the number of rows per shopping mall.
#
# The counts are computed once and the bar labels come from their index, so
# labels and bar heights cannot get out of step with each other.
mall_counts = data['shopping_mall'].value_counts()
titles = mall_counts.index.tolist()
qnt = mall_counts.tolist()
colors=['#fbf8cc','#fde4cf','#ffcfd2','#f1c0e8','#cfbaf0','#a3c4f3','#90dbf4','#8eecf5','#98f5e1','#b9fbc0']

fig = plt.figure()
ax = fig.add_axes([0,0,2,1])
ax.bar(titles,qnt, color=colors, width=0.5)
ax.set_facecolor("black")
plt.grid(axis = 'y')
plt.title('Shopping Mall')
plt.show()

##### Age, quantity and price:

In [None]:
# Summary statistics of the numeric columns, one row per column.
data.select_dtypes(include='number').describe().T

In [None]:
# Bar chart with the number of rows per age group.
#
# Groups: under 20, then one group per decade, then 70 and over.
# np.digitize maps each age to the index of its group (0 for ages below the
# first edge, 6 for ages at or above the last edge) and np.bincount tallies
# how many rows fall in each group, so no Python-level loop over the rows is
# needed.
age_group_lower_edges = [20, 30, 40, 50, 60, 70]
titles = ['0 a 19', '20 a 29','30 a 39','40 a 49','50 a 59','60 a 69','70+']

age_group_index = np.digitize(data['age'].to_numpy(), age_group_lower_edges)
qnt = np.bincount(age_group_index, minlength=len(titles)).tolist()

# Per-group names kept for any later code that refers to them.
_0a19_, _20a29_,_30a39_,_40a49_,_50a59_,_60a69_,_70_ = qnt

colors=['#fbf8cc','#fde4cf','#ffcfd2','#f1c0e8','#cfbaf0','#a3c4f3','#90dbf4','#8eecf5','#98f5e1','#b9fbc0']

fig = plt.figure()
ax = fig.add_axes([0,0,2,1])
ax.bar(titles,qnt, color=colors, width=0.5)
plt.grid(axis = 'y')
ax.set_facecolor("black")
# The bars are age groups, so the title says so.
plt.title('Purchases by age group')
plt.show()


##### Date:

In [None]:
# Number of purchases and total amount (price * quantity) per calendar month,
# each drawn as a bar chart.
#
# The month is the second "/"-separated field of invoice_date. It is parsed
# as an integer so that a zero-padded value such as "05" is counted together
# with "5" instead of being silently skipped.
month_of_invoice = data['invoice_date'].str.split('/').str[1].astype(int)

# price and quantity are read as floats for this computation only; the
# columns stored in `data` are left untouched so later cells still see them
# as numeric.
line_totals = data['price'].astype('float64') * data['quantity'].astype('float64')

# Reindexing on 1..12 fixes the bar order and gives 0 to months with no rows.
all_months = list(range(1, 13))
qnt = (
    month_of_invoice.value_counts()
    .reindex(all_months, fill_value=0)
    .tolist()
)
qntAmount = (
    line_totals.groupby(month_of_invoice).sum()
    .reindex(all_months, fill_value=0.0)
    .tolist()
)

# Per-month names kept for any later code that refers to them.
(january, february, march, april, may, june,
 july, august, september, october, november, december) = qnt
(januaryAmount, februaryAmount, marchAmount, aprilAmount,
 mayAmount, juneAmount, julyAmount, augustAmount,
 septemberAmount, octoberAmount, novemberAmount, decemberAmount) = qntAmount

colors=['#fbf8cc','#fde4cf','#ffcfd2','#f1c0e8','#cfbaf0','#a3c4f3','#90dbf4','#8eecf5','#98f5e1','#b9fbc0']
titles = ['January', 'February','March','April','May','June','July','August','September','October', 'November','December']

fig = plt.figure()
ax = fig.add_axes([0,0,2,1])
ax.bar(titles,qnt, color=colors, width=0.5)
plt.grid(axis = 'y')
ax.set_facecolor("black")
plt.title('Purchases by month')


fig2 = plt.figure()
ax2 = fig2.add_axes([0,0,2,1])
ax2.bar(titles,qntAmount, color=colors, width=0.5)
ax2.set_facecolor("black")
plt.title('Amount gained on sales ($)')
plt.grid(axis = 'y')

plt.show()

#### Preparando as variáveis e treinando o modelo:

In [3]:
# Column groups used by the cells below: columns to drop, columns to store as
# 'category', columns to store as float64, and a (currently empty) group of
# discrete columns.
ignore_variables = ['customer_id', 'invoice_date']
categorical_variables = ['gender', 'category', 'payment_method', 'shopping_mall']
continuous_variables = ['age', 'quantity', 'price']
discrete_variables = []

In [5]:
# Working frame without the ignored columns; `data` itself is left unchanged
# because drop() returns a new DataFrame.
dt = data.drop(columns=ignore_variables)

In [8]:
# Give every column its working dtype in a single pass: float64 for the
# continuous group, 'category' for the categorical group.
dtype_by_column = {
    **dict.fromkeys(continuous_variables, 'float64'),
    **dict.fromkeys(categorical_variables, 'category'),
}
dt = dt.astype(dtype_by_column)

In [9]:
# Summary of the categorical columns, one row per column, ordered by the
# 'count' statistic.
(
    dt.select_dtypes('category')
    .describe()
    .transpose()
    .sort_values(by='count', ascending=True)
)

Unnamed: 0,count,unique,top,freq
gender,99457,2,Female,59482
category,99457,8,Clothing,34487
payment_method,99457,3,Cash,44447
shopping_mall,99457,10,Mall of Istanbul,19943


In [10]:
# Summary statistics of the numeric columns, one row per column, ordered by
# the 'count' statistic.
(
    dt.select_dtypes('number')
    .describe()
    .transpose()
    .sort_values(by='count', ascending=True)
)

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
age,99457.0,43.427089,14.990054,18.0,30.0,43.0,56.0,69.0
quantity,99457.0,3.003429,1.413025,1.0,2.0,3.0,4.0,5.0
price,99457.0,689.256321,941.184567,5.23,45.45,203.3,1200.32,5250.0


In [11]:
# Same categorical summary, computed from the original `data` frame and kept
# in `summary`.
summary = (
    data[categorical_variables]
    .describe()
    .T
    .sort_values(by='count')
)
summary

Unnamed: 0,count,unique,top,freq
gender,99457,2,Female,59482
category,99457,8,Clothing,34487
payment_method,99457,3,Cash,44447
shopping_mall,99457,10,Mall of Istanbul,19943


In [None]:
# One-hot encode the categorical columns only.
#
# The identifier columns (invoice_no, customer_id) and the raw invoice_date
# string are dropped first: encoding them would create one dummy column per
# distinct value, which for per-row identifiers means roughly as many columns
# as there are rows. Naming the columns to encode also keeps numeric columns
# out of the encoding regardless of the dtype they currently carry.
model_data = pd.get_dummies(
    data.drop(columns=['invoice_no', 'customer_id', 'invoice_date']),
    columns=['gender', 'category', 'payment_method', 'shopping_mall'],
    drop_first=False,
)

In [None]:
# Build the feature matrix X and the target vector y (price).
#
# Text columns cannot be fed to a linear regression as they are, so the
# remaining categorical features are one-hot encoded and everything is
# converted to float64. The identifiers, gender and the raw invoice_date
# string are left out of the features.
dt = data.copy()
feature_frame = dt.drop(
    columns=['price', 'invoice_no', 'customer_id', 'gender', 'invoice_date']
)
# Make sure the numeric features are plain floats whatever dtype they carry
# at this point, so get_dummies leaves them alone.
feature_frame[['age', 'quantity']] = feature_frame[['age', 'quantity']].astype('float64')
# drop_first=True removes one level per feature: the full set of dummies is
# collinear with the intercept fitted by the regression.
X = pd.get_dummies(feature_frame, drop_first=True).astype('float64').values
y = dt['price'].astype('float64').values


In [None]:
# Hold out 25% of the rows for testing. The seed value itself is arbitrary;
# it is fixed so the split is the same on every run.
RANDOM_SEED = 33
Xtrain, Xtest, ytrain, ytest = train_test_split(X, y, test_size=0.25, random_state=RANDOM_SEED)

In [None]:
# Shapes of the full, training and test feature matrices.
tuple(matrix.shape for matrix in (X, Xtrain, Xtest))

In [None]:
# Shapes of the full, training and test target vectors.
tuple(vector.shape for vector in (y, ytrain, ytest))

In [None]:
# Fit an ordinary least squares regression on the training split.
model = LinearRegression()
model.fit(X=Xtrain, y=ytrain)


In [None]:
# Predict on the held-out split and report the root mean squared error.
ypred = model.predict(Xtest)
test_mse = mean_squared_error(ytest, ypred)
RMSE = np.sqrt(test_mse)
RMSE
