# My Libraries

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import warnings
warnings.filterwarnings("ignore")
import datetime as dt

# Data Wrangling

In [None]:
# Load the raw Online Retail workbook into a DataFrame and preview its first rows.
# NOTE(review): the path is absolute and machine-specific; adjust it when running elsewhere.
df = pd.read_excel(io='D:/sql/Data set/Retail/Online Retail.xlsx')
df.head(n=5)

# Data Preparing

## Replacing every 0 unit price with the most common (mode) price of the same product

In [None]:
# Descriptions of rows that belong to a known customer but carry a unit price of 0.
has_zero_price = df['UnitPrice'].eq(0)
has_customer = df['CustomerID'].notna()
zero_G = df.loc[has_zero_price & has_customer, ['Description']]

In [None]:
# Distinct, non-missing product descriptions that have at least one zero-priced row.
# De-duplicating avoids repeating the same lookup once per zero-priced row, and
# dropping NaN avoids searching for a description that can never match.
a = list(zero_G['Description'].dropna().unique())

In [None]:
# Replace every zero unit price with the most common positive price of the same product.
for i in a:
    description_match = df['Description'] == i
    # Only positive prices are eligible as the replacement value; otherwise a product
    # that is mostly listed at 0 would be "repaired" with 0 again.
    valid_prices = df.loc[description_match & (df['UnitPrice'] > 0), 'UnitPrice']
    if valid_prices.empty:
        # No reference price exists for this product, so its rows are left untouched.
        continue
    mode_price = valid_prices.mode().iloc[0]
    df.loc[description_match & (df['UnitPrice'] == 0), 'UnitPrice'] = mode_price

## adding total price column

In [None]:
df['TotalPrice']=df['UnitPrice']*df['Quantity']

In [None]:
df.info()

# Data Cleaning

## Removing non-customer & cancelled transactions

In [None]:
# Keep only rows with a positive quantity that belong to an identified customer.
# .copy() makes df2 an independent frame, so the in-place edits applied to it later
# (drop, column assignment, drop_duplicates) do not operate on a slice of df.
df2 = df.loc[(df['Quantity'] > 0) & df['CustomerID'].notna()].copy()

In [None]:
df2.info()

In [None]:
pd.DataFrame(df2.apply(lambda col: len(col.unique())),columns=["Unique Values Count"])

## Dropping unimportant features

In [None]:
# Remove the InvoiceNo and Description columns from df2 in place.
df2.drop(columns=['InvoiceNo', 'Description'], inplace=True)

In [None]:
# Cast the two categorical columns to plain strings.
for column in ('StockCode', 'Country'):
    df2[column] = df2[column].astype(str)

In [None]:
df2.info()

In [None]:
pd.DataFrame(df2.apply(lambda col: len(col.unique())),columns=["Unique Values Count"])

In [None]:
df['InvoiceDate'].max()

In [None]:
# Reference ("snapshot") date for recency: midnight of the day after the latest invoice.
# A hard-coded 2011-12-09 00:00 falls before any invoice time-stamped later on that
# day and would give such invoices a recency of -1 days.
today_date = df['InvoiceDate'].max().normalize() + dt.timedelta(days=1)

In [None]:
# Recency: whole days between the reference date and each invoice.
# NOTE: this overwrites the timestamp column with day counts, so the cell must not be
# executed twice on the same frame.
days_since_invoice = (today_date - df2['InvoiceDate']).dt.days
df2['InvoiceDate'] = days_since_invoice

In [None]:
print(df2.isnull().sum())

In [None]:
df2.duplicated().sum()

In [None]:
df2.drop_duplicates(inplace = True)

In [None]:
import tensorflow as tf
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler

In [None]:
# Replace the string categories with integer codes.
# NOTE: the same encoder object is refitted for each column, so afterwards it only
# holds the classes of the last column (Country).
label_encoder = LabelEncoder()
for column in ('StockCode', 'Country'):
    df2[column] = label_encoder.fit_transform(df2[column])


## grouping customer data

In [None]:
# Collapse the transaction rows into one row per customer:
#   InvoiceDate - smallest days-since-invoice value, i.e. the most recent purchase
#   Quantity    - total units bought
#   TotalPrice  - total amount spent
#   Country     - the customer's most frequent (encoded) country
# Series.mode() returns every tied value; taking .iloc[0] guarantees one scalar per
# customer, so a tie cannot break the aggregation or produce array-valued cells.
customer_data = df2.groupby('CustomerID').agg({'InvoiceDate': 'min',
                                               'Quantity': 'sum',
                                               'TotalPrice': 'sum',
                                               'Country': lambda codes: codes.mode().iloc[0]})
customer_data

In [None]:
# Turn the CustomerID index back into an ordinary column.
customer_data = customer_data.reset_index()
customer_data

## Removing outliers

In [None]:
# Flag customers whose behaviour lies outside the 1.5 * IQR fences.
# Only the behavioural measures are tested. CustomerID is an identifier and Country is
# a label-encoded category, so distance from the quartiles is meaningless for them;
# worse, when most customers share one country its IQR is 0 and every customer from
# any other country would be discarded as an "outlier".
outlier_features = ['InvoiceDate', 'Quantity', 'TotalPrice']
behaviour = customer_data[outlier_features]
Q1 = behaviour.quantile(0.25)
Q3 = behaviour.quantile(0.75)
IQR = Q3 - Q1
lower_fence = Q1 - 1.5 * IQR
upper_fence = Q3 + 1.5 * IQR
# A customer is an outlier if any one of the tested measures is outside its fences.
outliers = ((behaviour < lower_fence) | (behaviour > upper_fence)).any(axis=1)
# .copy() so that later column assignments act on an independent frame.
cleaned_data = customer_data[~outliers].copy()

In [None]:
# Pairwise correlations between the columns of the transaction-level frame (df2),
# drawn as an annotated heatmap.
correlation_matrix = df2.corr()
sns.heatmap(data=correlation_matrix, annot=True)
plt.show()

In [None]:
cleaned_data

In [None]:
print(customer_data.isnull().sum())

# Customer Segmentation by Neural Networks

In [None]:
# y keeps the customer identifiers; X is every other per-customer column.
y = cleaned_data['CustomerID']
X = cleaned_data.drop(columns=['CustomerID'])
y.head()

In [None]:
# Hold out 30% of the customers. A fixed random_state makes the split reproducible
# from one run of the notebook to the next.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [None]:
X_train=X_train.astype(np.float32)
X_test=X_test.astype(np.float32)

In [None]:
cleaned_data=cleaned_data.astype(np.float32)

In [None]:
from keras.layers import Input, Dense
from keras.models import Model

# The input/output width follows the feature matrix instead of a hard-coded 4, so the
# network keeps matching X if the set of feature columns changes.
n_features = X.shape[1]
encoding_dim = 4  # size of the encoded representation (unchanged)

input_layer = Input(shape=(n_features,))
encoded = Dense(encoding_dim, activation='relu')(input_layer)
# The sigmoid bounds every reconstructed value to the (0, 1) interval.
decoded = Dense(n_features, activation='sigmoid')(encoded)

# Autoencoder: input -> encoded -> decoded, trained with SGD on mean squared error.
autoencoder = Model(input_layer, decoded)
autoencoder.compile(optimizer='sgd', loss='mse')


In [None]:
autoencoder.fit(X,y, epochs=100, batch_size=256, shuffle=True, validation_split=0.3)

In [None]:
encoder = Model(input_layer, encoded)

In [None]:
compressed_data = encoder.predict(X)

In [None]:
from sklearn.cluster import KMeans

# Group the encoded customers into 3 segments. A fixed random_state removes K-Means'
# own initialisation randomness, so identical input yields identical segment numbers.
kmeans = KMeans(n_clusters=3, random_state=42)
clusters = kmeans.fit_predict(compressed_data)

In [None]:
# Attach each customer's cluster label, then print summary statistics per segment.
cleaned_data['Segment'] = clusters

for cluster in np.unique(clusters):
    in_segment = cleaned_data['Segment'] == cluster
    print(f"Cluster {cluster}:")
    print(cleaned_data[in_segment].describe())

# EDA after Segmentation

In [None]:
# Scatter Quantity against TotalPrice, one colour per segment (labels 0, 1, 2).
segment1, segment2, segment3 = (
    cleaned_data[cleaned_data['Segment'] == segment_id] for segment_id in (0, 1, 2)
)
plotted_segments = ((segment1, 'blue'), (segment2, 'red'), (segment3, 'yellow'))
for segment_id, (segment, colour) in enumerate(plotted_segments):
    plt.scatter(x=segment['Quantity'], y=segment['TotalPrice'], label=segment_id, color=colour)
plt.xlabel('Quantity')
plt.ylabel('TotalPrice')
plt.legend()

plt.show()

In [None]:
customer_seg = cleaned_data[['Segment','InvoiceDate','Quantity','TotalPrice']].groupby('Segment').agg(['count','mean','max','sum'])
customer_seg

In [None]:
customer_seg.sort_values(by=('TotalPrice','sum'), ascending=False)

In [None]:
customer_seg['TotalPrice']['sum'][2],customer_seg['TotalPrice']['sum'][0],customer_seg['TotalPrice']['sum'][1]

In [None]:
cleaned_data[cleaned_data['Segment'] == cluster].describe()

# Conclusions
## Segment 1 makes up 63% of all customers, 28% of all quantity & 28% of total revenue
## Segment 0 makes up 25% of all customers, 38% of all quantity & 37% of total revenue
## Segment 2 makes up 12% of all customers, 34% of all quantity & 35% of total revenue
