## Customer-Segmentation-Small-Project

In [1]:
# %pip install pandas numpy matplotlib seaborn nltk scikit-learn wordcloud plotly

Collecting sklearn
  Using cached sklearn-0.0.post10-py3-none-any.whl
Collecting wordcloud
  Obtaining dependency information for wordcloud from https://files.pythonhosted.org/packages/34/ac/72a4e42e76bf549dfd91791a6b10a9832f046c1d48b5e778be9ec012aa47/wordcloud-1.9.2-cp311-cp311-win_amd64.whl.metadata
  Downloading wordcloud-1.9.2-cp311-cp311-win_amd64.whl.metadata (3.4 kB)
Using cached wordcloud-1.9.2-cp311-cp311-win_amd64.whl (151 kB)
Installing collected packages: sklearn, wordcloud
Successfully installed sklearn-0.0.post10 wordcloud-1.9.2
Note: you may need to restart the kernel to use updated packages.


In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.cm as cm
import matplotlib as mpl
import seaborn as sns
import nltk
import warnings
import itertools
import datetime
from pathlib import Path
from sklearn import preprocessing, cluster, model_selection, metrics, svm, ensemble, decomposition, linear_model, tree
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from sklearn.model_selection import GridSearchCV, learning_curve
from sklearn.metrics import silhouette_samples, silhouette_score, confusion_matrix
from sklearn.svm import SVC
from sklearn.ensemble import AdaBoostClassifier
from sklearn.decomposition import PCA
from wordcloud import WordCloud, STOPWORDS
from IPython.display import display, HTML
import plotly.graph_objs as go
import plotly.offline as py
from plotly.offline import init_notebook_mode, iplot

In [32]:
# Notebook-wide display configuration.
init_notebook_mode(connected=True)  # set up Plotly's offline mode for inline notebook output
warnings.simplefilter("ignore")  # silence every Python warning for the rest of the session
mpl.rcParams["patch.force_edgecolor"] = True  # always draw patch outlines (e.g. bars in a bar chart)
plt.style.use('fivethirtyeight')  # base look for all Matplotlib figures
# Patch outline settings, applied after the style sheet so these values are the ones in effect.
mpl.rcParams['patch.edgecolor'] = 'dimgray'
mpl.rcParams['patch.linewidth'] = 1

### 1. Data preparation

The encoding="ISO-8859-1" argument specifies the character encoding used in the file. ISO-8859-1 (also known as Latin-1) is a commonly used encoding for files that contain characters from Western European languages.  
This dataframe contains 8 variables that correspond to:  
**InvoiceNo**: Invoice number. Nominal, a 6-digit number uniquely assigned to each transaction. If this code starts with the letter 'C', it indicates a cancellation.  <br>
**StockCode**: Product (item) code. Nominal, typically a 5-digit code uniquely assigned to each distinct product, sometimes followed by a letter (e.g. 85123A). <br>
**Description**: Product (item) name. Nominal. <br>
**Quantity**: The quantities of each product (item) per transaction. Numeric.	<br>
**InvoiceDate**: Invoice date and time, i.e. the day and time when each transaction was generated. Stored as text in the raw file (e.g. 12/1/2010 8:26). <br>
**UnitPrice**: Unit price. Numeric, Product price per unit in sterling. <br>
**CustomerID**: Customer number. Nominal, a 5-digit integral number uniquely assigned to each customer. <br>
**Country**: Country name. Nominal, the name of the country where each customer resides.<br>

In [21]:
# Load the raw transactions file.
# Both identifier columns are read as strings so they stay text rather than being parsed as numbers.
# The invoice column is named 'InvoiceNo' (the file has no 'InvoiceID' column), so that is the
# key the dtype mapping must use for the coercion to take effect.
df_initial = pd.read_csv(
    'data.csv',
    encoding="ISO-8859-1",
    dtype={'CustomerID': str, 'InvoiceNo': str},
)
print('Dataframe dimensions:', df_initial.shape)

Dataframe dimensions: (541909, 8)


#### Step-1

In [8]:
# Preview the first 10 rows, then print the column summary.
display(df_initial.head(10))
# DataFrame.info() prints its report itself and returns None, so it is called directly;
# wrapping it in display() would render a stray "None" after the summary.
df_initial.info()

Unnamed: 0,InvoiceNo,StockCode,Description,Quantity,InvoiceDate,UnitPrice,CustomerID,Country
0,536365,85123A,WHITE HANGING HEART T-LIGHT HOLDER,6,12/1/2010 8:26,2.55,17850,United Kingdom
1,536365,71053,WHITE METAL LANTERN,6,12/1/2010 8:26,3.39,17850,United Kingdom
2,536365,84406B,CREAM CUPID HEARTS COAT HANGER,8,12/1/2010 8:26,2.75,17850,United Kingdom
3,536365,84029G,KNITTED UNION FLAG HOT WATER BOTTLE,6,12/1/2010 8:26,3.39,17850,United Kingdom
4,536365,84029E,RED WOOLLY HOTTIE WHITE HEART.,6,12/1/2010 8:26,3.39,17850,United Kingdom
5,536365,22752,SET 7 BABUSHKA NESTING BOXES,2,12/1/2010 8:26,7.65,17850,United Kingdom
6,536365,21730,GLASS STAR FROSTED T-LIGHT HOLDER,6,12/1/2010 8:26,4.25,17850,United Kingdom
7,536366,22633,HAND WARMER UNION JACK,6,12/1/2010 8:28,1.85,17850,United Kingdom
8,536366,22632,HAND WARMER RED POLKA DOT,6,12/1/2010 8:28,1.85,17850,United Kingdom
9,536367,84879,ASSORTED COLOUR BIRD ORNAMENT,32,12/1/2010 8:34,1.69,13047,United Kingdom


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 541909 entries, 0 to 541908
Data columns (total 8 columns):
 #   Column       Non-Null Count   Dtype  
---  ------       --------------   -----  
 0   InvoiceNo    541909 non-null  object 
 1   StockCode    541909 non-null  object 
 2   Description  540455 non-null  object 
 3   Quantity     541909 non-null  int64  
 4   InvoiceDate  541909 non-null  object 
 5   UnitPrice    541909 non-null  float64
 6   CustomerID   406829 non-null  object 
 7   Country      541909 non-null  object 
dtypes: float64(1), int64(1), object(6)
memory usage: 33.1+ MB


None

#### Step-2

In [22]:
# Remove every row that has a missing value in any column.
df_dropna = df_initial.dropna(axis=0, how='any')
# Report the row count on each side of the filter.
print(f'Number of rows before dropping null values: {len(df_initial)}')
print(f'Number of rows after dropping null values: {len(df_dropna)}')

Number of rows before dropping null values: 541909
Number of rows after dropping null values: 406829


#### Step-3

In [23]:
# Discard rows that exactly repeat an earlier row; the first occurrence is kept.
df_drop_duplicates = df_dropna.drop_duplicates(keep='first')
df_cleaned = df_drop_duplicates  # same frame, under the name the following steps use
# Report the row count on each side of the de-duplication.
print(f'Number of rows before dropping duplicate rows: {len(df_dropna)}')
print(f'Number of rows after dropping duplicate rows: {len(df_drop_duplicates)}')

Number of rows before dropping duplicate rows: 406829
Number of rows after dropping duplicate rows: 401604


#### Step-4

In [30]:
# Count the orders placed from each country.
# An order is treated as one distinct (CustomerID, InvoiceNo, Country) combination: grouping on
# all three columns collapses the individual line items of an invoice into a single group, and
# reset_index turns the group keys back into ordinary columns.
temp = (
    df_cleaned[['CustomerID', 'InvoiceNo', 'Country']]
    .groupby(['CustomerID', 'InvoiceNo', 'Country'])
    .count()
    .reset_index(drop=False)
)
# Series indexed by country name; each value is the number of orders from that country.
countries = temp['Country'].value_counts()

Index(['United Kingdom', 'Germany', 'France', 'EIRE', 'Belgium', 'Spain',
       'Netherlands', 'Switzerland', 'Portugal', 'Australia', 'Italy',
       'Finland', 'Sweden', 'Norway', 'Channel Islands', 'Japan', 'Poland',
       'Denmark', 'Cyprus', 'Austria', 'Singapore', 'Malta', 'Unspecified',
       'USA', 'Iceland', 'Israel', 'Canada', 'Greece', 'Czech Republic',
       'European Community', 'Lithuania', 'United Arab Emirates',
       'Saudi Arabia', 'Bahrain', 'RSA', 'Lebanon', 'Brazil'],
      dtype='object')


In [38]:
# Choropleth world map of the number of orders per country, rendered inline with Plotly.

# Trace description: one choropleth trace coloured by order count.
# NOTE(review): some labels in the data (e.g. 'EIRE', 'RSA', 'Unspecified') may not be
# recognised by the 'country names' lookup - confirm they actually appear on the map.
data = {
    'type': 'choropleth',
    'locations': countries.index,      # country names taken from the Series index
    'locationmode': 'country names',   # interpret `locations` as country names
    'z': countries,                    # order count per country drives the colour
    'text': countries.index,           # hover label for each country
    'colorbar': {'title': 'Order nb.'},
    # [position, colour] stops; most stops sit below 0.2 of the range, presumably so that
    # countries with few orders are not all drawn in the same colour.
    'colorscale': [
        [0, 'rgb(224,255,255)'],
        [0.01, 'rgb(166,206,227)'],
        [0.02, 'rgb(31,120,180)'],
        [0.03, 'rgb(178,223,138)'],
        [0.05, 'rgb(51,160,44)'],
        [0.10, 'rgb(251,154,153)'],
        [0.20, 'rgb(255,255,0)'],
        [1, 'rgb(227,26,28)'],
    ],
    'reversescale': False,             # keep the colour order as listed
}

# Figure-level appearance: title, map frame and projection.
layout = {
    'title': 'Number of orders per country',
    'geo': {'showframe': True, 'projection': {'type': 'mercator'}},
}

# Assemble the figure from the trace and layout, then display it in the notebook.
choromap = go.Figure(data=[data], layout=layout)
iplot(choromap)

#### Step-5

In [None]:
# Explore the cancelled orders, stock codes and prices. Try to understand the patterns in each column and make any dataframe edits necessary.
