### Data Warehousing and data mining task

In [1]:
# importing the necessary libraries
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

### Loading the dataset

In [2]:
# Loading the dataset into a pandas dataframe, skipping problematic rows on_bad_lines='skip',
data = pd.read_csv("sales_data_cleaned.csv", encoding = 'unicode_escape')
data.head(10)

Unnamed: 0,ORDERNUMBER,QUANTITYORDERED,PRICEEACH,ORDERLINENUMBER,SALES,ORDERDATE,STATUS,QTR_ID,MONTH_ID,YEAR_ID,...,ADDRESSLINE1,ADDRESSLINE2,CITY,STATE,POSTALCODE,COUNTRY,TERRITORY,CONTACTLASTNAME,CONTACTFIRSTNAME,DEALSIZE
0,10107,30,95.7,2,2871.0,2/24/2003 0:00,Shipped,1,2,2003,...,897 Long Airport Avenue,,NYC,NY,10022,USA,,Yu,Kwai,Small
1,10121,34,81.35,5,2765.9,5/7/2003 0:00,Shipped,2,5,2003,...,59 rue de l'Abbaye,,Reims,,51100,France,EMEA,Henriot,Paul,Small
2,10134,41,94.74,2,3884.34,7/1/2003 0:00,Shipped,3,7,2003,...,27 rue du Colonel Pierre Avia,,Paris,,75508,France,EMEA,Da Cunha,Daniel,Medium
3,10145,45,83.26,6,3746.7,8/25/2003 0:00,Shipped,3,8,2003,...,78934 Hillside Dr.,,Pasadena,CA,90003,USA,,Young,Julie,Medium
4,10159,49,100.0,14,5205.27,10/10/2003 0:00,Shipped,4,10,2003,...,7734 Strong St.,,San Francisco,CA,,USA,,Brown,Julie,Medium
5,10168,36,96.66,1,3479.76,10/28/2003 0:00,Shipped,4,10,2003,...,9408 Furth Circle,,Burlingame,CA,94217,USA,,Hirano,Juri,Medium
6,10180,29,86.13,9,2497.77,11/11/2003 0:00,Shipped,4,11,2003,...,"""184 chausse de Tournai""",,Lille,,59000,France,EMEA,Rance,Martine,Small
7,10188,48,100.0,1,5512.32,11/18/2003 0:00,Shipped,4,11,2003,...,"""Drammen 121 PR 744 Sentrum""",,Bergen,,N 5804,Norway,EMEA,Oeztan,Veysel,Medium
8,10201,22,98.57,2,2168.54,12/1/2003 0:00,Shipped,4,12,2003,...,5557 North Pendale Street,,San Francisco,CA,,USA,,Murphy,Julie,Small
9,10211,41,100.0,14,4708.44,1/15/2004 0:00,Shipped,1,1,2004,...,"""25 rue Lauriston""",,Paris,,75016,France,EMEA,Perrier,Dominique,Medium


### Data Understanding

In [3]:
# getting the columns of the dataset
data.columns

Index(['ORDERNUMBER', ' QUANTITYORDERED', ' PRICEEACH', ' ORDERLINENUMBER',
       ' SALES   ', ' ORDERDATE      ', ' STATUS    ', ' QTR_ID', ' MONTH_ID',
       ' YEAR_ID', ' PRODUCTLINE     ', ' MSRP', ' PRODUCTCODE',
       ' CUSTOMERNAME                        ', ' PHONE            ',
       ' ADDRESSLINE1                              ', ' ADDRESSLINE2',
       ' CITY          ', ' STATE        ', ' POSTALCODE', ' COUNTRY    ',
       ' TERRITORY', ' CONTACTLASTNAME', ' CONTACTFIRSTNAME', ' DEALSIZE'],
      dtype='object')

In [4]:
# random sampling of the dataset
data.sample(10)

Unnamed: 0,ORDERNUMBER,QUANTITYORDERED,PRICEEACH,ORDERLINENUMBER,SALES,ORDERDATE,STATUS,QTR_ID,MONTH_ID,YEAR_ID,...,ADDRESSLINE1,ADDRESSLINE2,CITY,STATE,POSTALCODE,COUNTRY,TERRITORY,CONTACTLASTNAME,CONTACTFIRSTNAME,DEALSIZE
1722,10175,37,31.12,2,1151.44,11/6/2003 0:00,Shipped,4,11,2003,...,35 King George,,London,,WX3 6FW,UK,EMEA,Brown,Ann,Small
187,10117,33,100.0,9,6034.38,4/16/2003 0:00,Shipped,2,4,2003,...,"""Bronz Sok. Bronz Apt. 3/6 Tesvikiye""",,Singapore,,79903,Singapore,Japan,Natividad,Eric,Medium
714,10259,34,99.41,7,3379.94,6/15/2004 0:00,Shipped,2,6,2004,...,Village Close - 106 Linden Road Sandown,2nd Floor,Singapore,,69045,Singapore,APAC,Victorino,Wendy,Medium
491,10304,37,95.55,13,3535.35,10/11/2004 0:00,Shipped,4,10,2004,...,"""67 avenue de l'Europe""",,Versailles,,78000,France,EMEA,Tonini,Daniel,Medium
1448,10220,26,56.07,8,1457.82,2/12/2004 0:00,Shipped,1,2,2004,...,25 Maiden Lane,Floor No. 4,Dublin,,2,Ireland,EMEA,Cassidy,Dean,Small
965,10150,26,100.0,10,2804.36,9/19/2003 0:00,Shipped,3,9,2003,...,"""Bronz Sok. Bronz Apt. 3/6 Tesvikiye""",,Singapore,,79903,Singapore,Japan,Natividad,Eric,Small
2091,10324,33,100.0,3,6267.69,11/5/2004 0:00,Shipped,4,11,2004,...,2678 Kingston Rd.,Suite 101,NYC,NY,10022,USA,,Frick,Michael,Medium
611,10239,46,73.92,4,3400.32,4/12/2004 0:00,Shipped,2,4,2004,...,Torikatu 38,,Oulu,,90110,Finland,EMEA,Koskitalo,Pirkko,Medium
2507,10401,11,100.0,8,1135.31,4/3/2005 0:00,On Hold,2,4,2005,...,7476 Moss Rd.,,Newark,NJ,94019,USA,,Brown,William,Small
2594,10177,32,76.62,1,2451.84,11/7/2003 0:00,Shipped,4,11,2003,...,"""Merchants House 27-30 Merchant's Quay""",,Madrid,,28023,Spain,EMEA,Fernandez,Jesus,Small


In [5]:
# preview the tail of the dataset
data.tail(10)

Unnamed: 0,ORDERNUMBER,QUANTITYORDERED,PRICEEACH,ORDERLINENUMBER,SALES,ORDERDATE,STATUS,QTR_ID,MONTH_ID,YEAR_ID,...,ADDRESSLINE1,ADDRESSLINE2,CITY,STATE,POSTALCODE,COUNTRY,TERRITORY,CONTACTLASTNAME,CONTACTFIRSTNAME,DEALSIZE
2813,10293,32,60.06,1,1921.92,9/9/2004 0:00,Shipped,3,9,2004,...,Via Monte Bianco 34,,Torino,,10100,Italy,EMEA,Accorti,Paolo,Small
2814,10306,35,59.51,6,2082.85,10/14/2004 0:00,Shipped,4,10,2004,...,Fauntleroy Circus,,Manchester,,EC2 5NT,UK,EMEA,Ashworth,Victoria,Small
2815,10315,40,55.69,5,2227.6,10/29/2004 0:00,Shipped,4,10,2004,...,"""67 rue des Cinquante Otages""",,Nantes,,44000,France,EMEA,Labrune,Janine,Small
2816,10327,37,86.74,4,3209.38,11/10/2004 0:00,Resolved,4,11,2004,...,Vinb'ltet 34,,Kobenhavn,,1734,Denmark,EMEA,Petersen,Jytte,Medium
2817,10337,42,97.16,5,4080.72,11/21/2004 0:00,Shipped,4,11,2004,...,5905 Pompton St.,Suite 750,NYC,NY,10022,USA,,Hernandez,Maria,Medium
2818,10350,20,100.0,15,2244.4,12/2/2004 0:00,Shipped,4,12,2004,...,"""C/ Moralzarzal 86""",,Madrid,,28034,Spain,EMEA,Freyre,Diego,Small
2819,10373,29,100.0,1,3978.51,1/31/2005 0:00,Shipped,1,1,2005,...,Torikatu 38,,Oulu,,90110,Finland,EMEA,Koskitalo,Pirkko,Medium
2820,10386,43,100.0,4,5417.57,3/1/2005 0:00,Resolved,1,3,2005,...,"""C/ Moralzarzal 86""",,Madrid,,28034,Spain,EMEA,Freyre,Diego,Medium
2821,10397,34,62.24,1,2116.16,3/28/2005 0:00,Shipped,1,3,2005,...,1 rue Alsace-Lorraine,,Toulouse,,31000,France,EMEA,Roulet,Annette,Small
2822,10414,47,65.52,9,3079.44,5/6/2005 0:00,On Hold,2,5,2005,...,8616 Spinnaker Dr.,,Boston,MA,51003,USA,,Yoshido,Juri,Medium


In [6]:
# getting more information on the dataset
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2823 entries, 0 to 2822
Data columns (total 25 columns):
 #   Column                                       Non-Null Count  Dtype  
---  ------                                       --------------  -----  
 0   ORDERNUMBER                                  2823 non-null   int64  
 1    QUANTITYORDERED                             2823 non-null   int64  
 2    PRICEEACH                                   2823 non-null   float64
 3    ORDERLINENUMBER                             2823 non-null   int64  
 4    SALES                                       2823 non-null   float64
 5    ORDERDATE                                   2823 non-null   object 
 6    STATUS                                      2823 non-null   object 
 7    QTR_ID                                      2823 non-null   int64  
 8    MONTH_ID                                    2823 non-null   int64  
 9    YEAR_ID                                     2823 non-null   int64  
 10  

### Data Cleaning

In [7]:
## check for missing values
data.isna().sum()

ORDERNUMBER                                    0
 QUANTITYORDERED                               0
 PRICEEACH                                     0
 ORDERLINENUMBER                               0
 SALES                                         0
 ORDERDATE                                     0
 STATUS                                        0
 QTR_ID                                        0
 MONTH_ID                                      0
 YEAR_ID                                       0
 PRODUCTLINE                                   0
 MSRP                                          0
 PRODUCTCODE                                   0
 CUSTOMERNAME                                  0
 PHONE                                         0
 ADDRESSLINE1                                  0
 ADDRESSLINE2                                  0
 CITY                                          0
 STATE                                         0
 POSTALCODE                                    0
 COUNTRY            

In [8]:
data.columns.str.strip()

Index(['ORDERNUMBER', 'QUANTITYORDERED', 'PRICEEACH', 'ORDERLINENUMBER',
       'SALES', 'ORDERDATE', 'STATUS', 'QTR_ID', 'MONTH_ID', 'YEAR_ID',
       'PRODUCTLINE', 'MSRP', 'PRODUCTCODE', 'CUSTOMERNAME', 'PHONE',
       'ADDRESSLINE1', 'ADDRESSLINE2', 'CITY', 'STATE', 'POSTALCODE',
       'COUNTRY', 'TERRITORY', 'CONTACTLASTNAME', 'CONTACTFIRSTNAME',
       'DEALSIZE'],
      dtype='object')

In [9]:
# checking for consistency
## checking for duplicated values 

data.duplicated().sum()

0

### Exercise 1 Data splitting

#### 1. Create Orders Dataset 

In [10]:
# Select columns for orders, including foreign keys for product and customer.

# Strip leading spaces from column names
data.columns = data.columns.str.strip()

orders_cols = [
    'ORDERNUMBER', 'ORDERDATE', 'SALES', 'STATUS', 'QTR_ID', 'MONTH_ID', 'YEAR_ID',
    'ORDERLINENUMBER', 'QUANTITYORDERED', 'PRICEEACH', 'PRODUCTCODE', 'CUSTOMERNAME'
]
orders_df = data[orders_cols]

orders_df.head(10)

# Save orders to CSV
orders_df.to_csv('Exercise1_data/orders.csv', index=False)

In [11]:
orders_df.shape

(2823, 12)

In [12]:
orders_df.head(10)

Unnamed: 0,ORDERNUMBER,ORDERDATE,SALES,STATUS,QTR_ID,MONTH_ID,YEAR_ID,ORDERLINENUMBER,QUANTITYORDERED,PRICEEACH,PRODUCTCODE,CUSTOMERNAME
0,10107,2/24/2003 0:00,2871.0,Shipped,1,2,2003,2,30,95.7,S10_1678,Land of Toys Inc.
1,10121,5/7/2003 0:00,2765.9,Shipped,2,5,2003,5,34,81.35,S10_1678,Reims Collectables
2,10134,7/1/2003 0:00,3884.34,Shipped,3,7,2003,2,41,94.74,S10_1678,Lyon Souveniers
3,10145,8/25/2003 0:00,3746.7,Shipped,3,8,2003,6,45,83.26,S10_1678,Toys4GrownUps.com
4,10159,10/10/2003 0:00,5205.27,Shipped,4,10,2003,14,49,100.0,S10_1678,Corporate Gift Ideas Co.
5,10168,10/28/2003 0:00,3479.76,Shipped,4,10,2003,1,36,96.66,S10_1678,Technics Stores Inc.
6,10180,11/11/2003 0:00,2497.77,Shipped,4,11,2003,9,29,86.13,S10_1678,Daedalus Designs Imports
7,10188,11/18/2003 0:00,5512.32,Shipped,4,11,2003,1,48,100.0,S10_1678,Herkku Gifts
8,10201,12/1/2003 0:00,2168.54,Shipped,4,12,2003,2,22,98.57,S10_1678,Mini Wheels Co.
9,10211,1/15/2004 0:00,4708.44,Shipped,1,1,2004,14,41,100.0,S10_1678,Auto Canal Petit


In [13]:
orders_df.columns

Index(['ORDERNUMBER', 'ORDERDATE', 'SALES', 'STATUS', 'QTR_ID', 'MONTH_ID',
       'YEAR_ID', 'ORDERLINENUMBER', 'QUANTITYORDERED', 'PRICEEACH',
       'PRODUCTCODE', 'CUSTOMERNAME'],
      dtype='object')

#### Splitting based on years 

In [14]:
def split_orders_by_year(df: pd.DataFrame) -> dict:
    # Group the DataFrame by the 'YEAR_ID' column and create a dictionary of DataFrames.
    orders_by_year = {year: group.copy() for year, group in df.groupby('YEAR_ID')}
    return orders_by_year

In [15]:
# Splitting the orders DataFrame by year
orders_by_year = split_orders_by_year(orders_df)

# # Displaying the keys and corresponding DataFrames
# for year, df_year in orders_by_year.items():
#     print(f"Year: {year}")
#     print(df_year, "\n")

In [16]:
orders_by_year.keys()

dict_keys([2003, 2004, 2005])

In [17]:
from pathlib import Path

Exercise1_data1 = Path("Exercise1_data1")
Exercise1_data1.mkdir(exist_ok=True) 

# Save each year's DataFrame as a CSV file
for year, df_year in orders_by_year.items():
    filename = f"orders_{year}.csv"
    df_year.to_csv(Exercise1_data1 / filename, index=False)
    print(f"Saved {filename}")

Saved orders_2003.csv
Saved orders_2004.csv
Saved orders_2005.csv


### 2. Create Customers Dataset 

In [20]:
# Select customer columns. Drop duplicates if customers appear in multiple rows.

customers_cols = ['CUSTOMERNAME', 'PHONE', 'CONTACTLASTNAME', 'CONTACTFIRSTNAME']

customers_df = data[customers_cols].drop_duplicates()

# Save customers to CSV
customers_df.to_csv('Exercise1_data/customers.csv', index=False)

In [21]:
customers_df.shape

(2823, 4)

In [22]:
customers_df.head(10)

Unnamed: 0,CUSTOMERNAME,PHONE,CONTACTLASTNAME,CONTACTFIRSTNAME
0,Land of Toys Inc.,2125557818,Yu,Kwai
1,Reims Collectables,26.47.1555,Henriot,Paul
2,Lyon Souveniers,+33 1 46 62 7555,Da Cunha,Daniel
3,Toys4GrownUps.com,6265557265,Young,Julie
4,Corporate Gift Ideas Co.,6505551386,Brown,Julie
5,Technics Stores Inc.,6505556809,Hirano,Juri
6,Daedalus Designs Imports,20.16.1555,Rance,Martine
7,Herkku Gifts,+47 2267 3215,Oeztan,Veysel
8,Mini Wheels Co.,6505555787,Murphy,Julie
9,Auto Canal Petit,(1) 47.55.6555,Perrier,Dominique


### 3. Create Products Dataset

In [24]:
# Select product-related columns. Drop duplicates to keep unique products.

products_cols = ['PRODUCTCODE', 'PRODUCTLINE', 'MSRP', 'DEALSIZE']
products_df = data[products_cols].drop_duplicates()

# Save products to CSV
products_df.to_csv('Exercise1_data/products.csv', index=False)

In [25]:
products_df.shape

(266, 4)

In [26]:
products_df.head(10)

Unnamed: 0,PRODUCTCODE,PRODUCTLINE,MSRP,DEALSIZE
0,S10_1678,Motorcycles,95,Small
2,S10_1678,Motorcycles,95,Medium
20,S10_1678,Motorcycles,95,Large
26,S10_1949,Classic Cars,214,Medium
27,S10_1949,Classic Cars,214,Large
51,S10_1949,Classic Cars,214,Small
54,S10_2016,Motorcycles,118,Medium
55,S10_2016,Motorcycles,118,Small
80,S10_4698,Motorcycles,193,Medium
81,S10_4698,Motorcycles,193,Large


### 4. Create Addresses Dataset

In [34]:
# Select address-related columns. Drop duplicates to keep unique addresses.

addresses_cols = ['ADDRESSLINE1', 'ADDRESSLINE2', 'CITY', 'STATE', 'POSTALCODE', 'COUNTRY', 'TERRITORY']

addresses_df = data[addresses_cols].drop_duplicates()

# Save addresses to CSV
addresses_df.to_csv('Exercise1_data/addresses.csv', index=False)

In [35]:
addresses_df.shape

(92, 7)

In [36]:
addresses_df.head(10)

Unnamed: 0,ADDRESSLINE1,ADDRESSLINE2,CITY,STATE,POSTALCODE,COUNTRY,TERRITORY
0,897 Long Airport Avenue,,NYC,NY,10022,USA,
1,59 rue de l'Abbaye,,Reims,,51100,France,EMEA
2,27 rue du Colonel Pierre Avia,,Paris,,75508,France,EMEA
3,78934 Hillside Dr.,,Pasadena,CA,90003,USA,
4,7734 Strong St.,,San Francisco,CA,,USA,
5,9408 Furth Circle,,Burlingame,CA,94217,USA,
6,"""184 chausse de Tournai""",,Lille,,59000,France,EMEA
7,"""Drammen 121 PR 744 Sentrum""",,Bergen,,N 5804,Norway,EMEA
8,5557 North Pendale Street,,San Francisco,CA,,USA,
9,"""25 rue Lauriston""",,Paris,,75016,France,EMEA


### Excercise 2 Data Splitting

In [37]:
data.columns.str.strip()

Index(['ORDERNUMBER', 'QUANTITYORDERED', 'PRICEEACH', 'ORDERLINENUMBER',
       'SALES', 'ORDERDATE', 'STATUS', 'QTR_ID', 'MONTH_ID', 'YEAR_ID',
       'PRODUCTLINE', 'MSRP', 'PRODUCTCODE', 'CUSTOMERNAME', 'PHONE',
       'ADDRESSLINE1', 'ADDRESSLINE2', 'CITY', 'STATE', 'POSTALCODE',
       'COUNTRY', 'TERRITORY', 'CONTACTLASTNAME', 'CONTACTFIRSTNAME',
       'DEALSIZE'],
      dtype='object')

### 1. Split for Fact Table: Orders

In [38]:
orders2_cols = [
    'ORDERNUMBER', 'ORDERDATE', 'SALES', 'STATUS', 'QTR_ID', 'MONTH_ID', 'YEAR_ID',
    'ORDERLINENUMBER', 'QUANTITYORDERED', 'PRICEEACH', 'PRODUCTCODE', 'CUSTOMERNAME'
]

orders2_df = data[orders2_cols]

orders2_df.to_csv('Exercise2_data/orders.csv', index=False)

In [39]:
orders2_df.shape

(2823, 12)

In [40]:
orders2_df.head(10)

Unnamed: 0,ORDERNUMBER,ORDERDATE,SALES,STATUS,QTR_ID,MONTH_ID,YEAR_ID,ORDERLINENUMBER,QUANTITYORDERED,PRICEEACH,PRODUCTCODE,CUSTOMERNAME
0,10107,2/24/2003 0:00,2871.0,Shipped,1,2,2003,2,30,95.7,S10_1678,Land of Toys Inc.
1,10121,5/7/2003 0:00,2765.9,Shipped,2,5,2003,5,34,81.35,S10_1678,Reims Collectables
2,10134,7/1/2003 0:00,3884.34,Shipped,3,7,2003,2,41,94.74,S10_1678,Lyon Souveniers
3,10145,8/25/2003 0:00,3746.7,Shipped,3,8,2003,6,45,83.26,S10_1678,Toys4GrownUps.com
4,10159,10/10/2003 0:00,5205.27,Shipped,4,10,2003,14,49,100.0,S10_1678,Corporate Gift Ideas Co.
5,10168,10/28/2003 0:00,3479.76,Shipped,4,10,2003,1,36,96.66,S10_1678,Technics Stores Inc.
6,10180,11/11/2003 0:00,2497.77,Shipped,4,11,2003,9,29,86.13,S10_1678,Daedalus Designs Imports
7,10188,11/18/2003 0:00,5512.32,Shipped,4,11,2003,1,48,100.0,S10_1678,Herkku Gifts
8,10201,12/1/2003 0:00,2168.54,Shipped,4,12,2003,2,22,98.57,S10_1678,Mini Wheels Co.
9,10211,1/15/2004 0:00,4708.44,Shipped,1,1,2004,14,41,100.0,S10_1678,Auto Canal Petit


### 2. Split for Customers Dimension

In [41]:
customers2_cols = ['CUSTOMERNAME', 'PHONE', 'CONTACTLASTNAME', 'CONTACTFIRSTNAME']

customers2_df = data[customers2_cols].drop_duplicates()

customers2_df.to_csv('Exercise2_data/customers.csv', index=False)

In [42]:
customers2_df.shape

(92, 4)

In [45]:
customers2_df.sample(10)

Unnamed: 0,CUSTOMERNAME,PHONE,CONTACTLASTNAME,CONTACTFIRSTNAME
37,Cambridge Collectables Co.,6175555555,Tseng,Kyung
171,"""Vida Sport Ltd""",0897-034555,Holz,Michael
8,Mini Wheels Co.,6505555787,Murphy,Julie
106,Danish Wholesale Imports,31 12 3555,Petersen,Jytte
121,Gifts4AllAges.com,6175559555,Yoshido,Juri
128,"""Oulu Toy Supplies Inc.""",981-443655,Koskitalo,Pirkko
464,"""Norway Gifts By Mail Co.""",+47 2212 1555,Klaeboe,Jan
937,Auto-Moto Classics Inc.,6175558428,Taylor,Leslie
34,"""Saveley & Henriot Co.""",78.32.5555,Saveley,Mary
32,"""Australian Gift Network Co""",61-7-3844-6555,Calaghan,Tony


### 3. Split for Products Dimension

In [46]:
products2_cols = ['PRODUCTCODE', 'PRODUCTLINE', 'MSRP', 'DEALSIZE']

products2_df = data[products2_cols].drop_duplicates()

products2_df.to_csv('Exercise2_data/products.csv', index=False)

In [47]:
products2_df.shape

(266, 4)

In [50]:
products2_df.sample(10)

Unnamed: 0,PRODUCTCODE,PRODUCTLINE,MSRP,DEALSIZE
1986,S24_3856,Classic Cars,140,Large
1771,S24_2887,Classic Cars,117,Small
268,S12_3148,Classic Cars,151,Large
1190,S18_3782,Motorcycles,62,Small
1269,S18_4409,Vintage Cars,92,Medium
1845,S24_3191,Classic Cars,85,Small
1633,S24_2022,Vintage Cars,44,Medium
2371,S50_1341,Vintage Cars,43,Medium
600,S18_1889,Classic Cars,77,Medium
2149,S32_1268,Trucks and Buses,96,Small


### 4. Split for Addresses Dimension

In [51]:
# --- Split for Addresses Dimension ---
addresses2_cols = ['ADDRESSLINE1', 'ADDRESSLINE2', 'CITY', 'STATE', 'POSTALCODE', 'COUNTRY', 'TERRITORY']

addresses2_df = data[addresses2_cols].drop_duplicates()

addresses2_df.to_csv('Exercise2_data/addresses.csv', index=False)

In [52]:
addresses2_df.shape

(92, 7)

In [53]:
addresses2_df.sample(10)

Unnamed: 0,ADDRESSLINE1,ADDRESSLINE2,CITY,STATE,POSTALCODE,COUNTRY,TERRITORY
142,6251 Ingle Ln.,,Boston,MA,51003,USA,
44,5677 Strong St.,,San Rafael,CA,97562,USA,
70,"""54 rue Royale""",,Nantes,,44000,France,EMEA
13,25593 South Bay Ln.,,Bridgewater,CT,97562,USA,
196,25 Maiden Lane,Floor No. 4,Dublin,,2,Ireland,EMEA
116,361 Furth Circle,,San Diego,CA,91217,USA,
481,Via Ludovico il Moro 22,,Bergamo,,24100,Italy,EMEA
7,"""Drammen 121 PR 744 Sentrum""",,Bergen,,N 5804,Norway,EMEA
462,8489 Strong St.,,Las Vegas,NV,83030,USA,
131,3086 Ingle Ln.,,San Jose,CA,94217,USA,


In [54]:
## more analysis on the dataset