In [1]:
import pandas as pd
import os 

In [2]:
# Directory that holds the sales CSV files.
folder_path = 'Sales_Data'

# Every CSV in the folder except 'all_data.csv', which is skipped explicitly.
# os.listdir() returns names in arbitrary order, so sort them to make the row
# order of the combined frame reproducible across runs and machines.
csv_files = sorted(
    file for file in os.listdir(folder_path)
    if file.endswith('.csv') and file != 'all_data.csv'
)

# Read each file once and concatenate in a single call: growing a DataFrame
# with pd.concat inside a loop re-copies all accumulated rows on every pass.
frames = [pd.read_csv(os.path.join(folder_path, file)) for file in csv_files]

# pd.concat raises ValueError on an empty list; fall back to an empty frame,
# which is what the loop-based version produced when no CSV files were found.
concatenated_df = pd.concat(frames, ignore_index=True) if frames else pd.DataFrame()

df = concatenated_df


In [3]:
# NOTE(review): this rebinds `df`, so the frame concatenated from every CSV
# in the previous cell is discarded and the following cells only process the
# January 2019 file. Confirm that this is intentional.
january_csv = "Sales_Data/Sales_January_2019.csv"
df = pd.read_csv(january_csv)

# Drop rows in which every column is NA

In [4]:
# Drop only the rows in which every column is NA (how="all"); rows with some
# values present are kept. The result is rebound rather than mutated with
# inplace=True, which keeps the step chainable and leaves any other reference
# to the original frame untouched.
df = df.dropna(how="all")

# Make column names easier to type (underscores instead of spaces, lowercase)

In [5]:
# Normalise the column labels: spaces become underscores, then everything is
# lower-cased, so a column can be accessed as an attribute (df.order_id).
df.columns = df.columns.str.replace(" ", "_").str.lower()
df.head()

Unnamed: 0,order_id,product,quantity_ordered,price_each,order_date,purchase_address
0,141234,iPhone,1,700.0,01/22/19 21:25,"944 Walnut St, Boston, MA 02215"
1,141235,Lightning Charging Cable,1,14.95,01/28/19 14:15,"185 Maple St, Portland, OR 97035"
2,141236,Wired Headphones,2,11.99,01/17/19 13:33,"538 Adams St, San Francisco, CA 94016"
3,141237,27in FHD Monitor,1,149.99,01/05/19 20:33,"738 10th St, Los Angeles, CA 90001"
4,141238,Wired Headphones,1,11.99,01/25/19 11:59,"387 10th St, Austin, TX 73301"


# Convert the columns to the correct data types

In [6]:
# Show the dtype of every column as loaded, before any conversion is applied.
df.dtypes

order_id            object
product             object
quantity_ordered    object
price_each          object
order_date          object
purchase_address    object
dtype: object

In [7]:
# Looks like the intended dtype per column, in column order.
# NOTE(review): this list is not referenced anywhere in this cell, and its
# 'int' entry disagrees with the 'float' used for quantity_ordered below.
columns_data_types = ['object', 'category', 'int', 'float', 'datetime64', 'object']

# Column -> dtype mapping that is actually applied; columns that are not
# listed (order_id, purchase_address) keep their current dtype.
convert_dict = {
    "product": 'category',
    "quantity_ordered": 'float',
    "price_each": 'float',
    "order_date": 'datetime64[ns]',
}

# Rows whose quantity_ordered holds the literal text "Quantity Ordered"
# (presumably header lines repeated inside the CSV) cannot be cast to float,
# so they are filtered out before the conversion.
is_label_row = df["quantity_ordered"].eq("Quantity Ordered")
df = df.loc[~is_label_row].astype(convert_dict)


In [8]:
# Re-check the column dtypes after the astype() conversion.
df.dtypes

order_id                    object
product                   category
quantity_ordered           float64
price_each                 float64
order_date          datetime64[ns]
purchase_address            object
dtype: object

In [9]:
# Preview the first rows to see the converted values.
df.head()

Unnamed: 0,order_id,product,quantity_ordered,price_each,order_date,purchase_address
0,141234,iPhone,1.0,700.0,2019-01-22 21:25:00,"944 Walnut St, Boston, MA 02215"
1,141235,Lightning Charging Cable,1.0,14.95,2019-01-28 14:15:00,"185 Maple St, Portland, OR 97035"
2,141236,Wired Headphones,2.0,11.99,2019-01-17 13:33:00,"538 Adams St, San Francisco, CA 94016"
3,141237,27in FHD Monitor,1.0,149.99,2019-01-05 20:33:00,"738 10th St, Los Angeles, CA 90001"
4,141238,Wired Headphones,1.0,11.99,2019-01-25 11:59:00,"387 10th St, Austin, TX 73301"


### Split the purchase_address column into 4 columns ["street", "city", "state", "zip_code"]

In [10]:
# An address reads "<street>, <city>, <state> <zip>"; splitting on ", "
# yields three pieces, one per new column.
address_parts = df['purchase_address'].str.split(", ", expand=True)
df[['street', 'city', 'state']] = address_parts

# The third piece still reads "<state> <zip>"; split it on the space and
# overwrite 'state' with the state code alone.
state_zip = df['state'].str.split(" ", expand=True)
df[['state', 'zip_code']] = state_zip
df.head()

Unnamed: 0,order_id,product,quantity_ordered,price_each,order_date,purchase_address,street,city,state,zip_code
0,141234,iPhone,1.0,700.0,2019-01-22 21:25:00,"944 Walnut St, Boston, MA 02215",944 Walnut St,Boston,MA,2215
1,141235,Lightning Charging Cable,1.0,14.95,2019-01-28 14:15:00,"185 Maple St, Portland, OR 97035",185 Maple St,Portland,OR,97035
2,141236,Wired Headphones,2.0,11.99,2019-01-17 13:33:00,"538 Adams St, San Francisco, CA 94016",538 Adams St,San Francisco,CA,94016
3,141237,27in FHD Monitor,1.0,149.99,2019-01-05 20:33:00,"738 10th St, Los Angeles, CA 90001",738 10th St,Los Angeles,CA,90001
4,141238,Wired Headphones,1.0,11.99,2019-01-25 11:59:00,"387 10th St, Austin, TX 73301",387 10th St,Austin,TX,73301


### Split order_date into ['year', 'month', 'day', 'weekday', 'hour']

In [11]:
# Derive calendar components from the order timestamp through a single
# shared .dt accessor. Columns are added in this order: year, month, day,
# weekday (full English day name), hour.
order_dt = df["order_date"].dt
df["year"] = order_dt.year
df["month"] = order_dt.month
df["day"] = order_dt.day
df["weekday"] = order_dt.day_name()
df["hour"] = order_dt.hour
df.head()

Unnamed: 0,order_id,product,quantity_ordered,price_each,order_date,purchase_address,street,city,state,zip_code,year,month,day,weekday,hour
0,141234,iPhone,1.0,700.0,2019-01-22 21:25:00,"944 Walnut St, Boston, MA 02215",944 Walnut St,Boston,MA,2215,2019,1,22,Tuesday,21
1,141235,Lightning Charging Cable,1.0,14.95,2019-01-28 14:15:00,"185 Maple St, Portland, OR 97035",185 Maple St,Portland,OR,97035,2019,1,28,Monday,14
2,141236,Wired Headphones,2.0,11.99,2019-01-17 13:33:00,"538 Adams St, San Francisco, CA 94016",538 Adams St,San Francisco,CA,94016,2019,1,17,Thursday,13
3,141237,27in FHD Monitor,1.0,149.99,2019-01-05 20:33:00,"738 10th St, Los Angeles, CA 90001",738 10th St,Los Angeles,CA,90001,2019,1,5,Saturday,20
4,141238,Wired Headphones,1.0,11.99,2019-01-25 11:59:00,"387 10th St, Austin, TX 73301",387 10th St,Austin,TX,73301,2019,1,25,Friday,11


In [12]:
# Line total: quantity ordered multiplied by the price of one unit.
df["total_price"] = df["quantity_ordered"].mul(df["price_each"])
df.head()

Unnamed: 0,order_id,product,quantity_ordered,price_each,order_date,purchase_address,street,city,state,zip_code,year,month,day,weekday,hour,total_price
0,141234,iPhone,1.0,700.0,2019-01-22 21:25:00,"944 Walnut St, Boston, MA 02215",944 Walnut St,Boston,MA,2215,2019,1,22,Tuesday,21,700.0
1,141235,Lightning Charging Cable,1.0,14.95,2019-01-28 14:15:00,"185 Maple St, Portland, OR 97035",185 Maple St,Portland,OR,97035,2019,1,28,Monday,14,14.95
2,141236,Wired Headphones,2.0,11.99,2019-01-17 13:33:00,"538 Adams St, San Francisco, CA 94016",538 Adams St,San Francisco,CA,94016,2019,1,17,Thursday,13,23.98
3,141237,27in FHD Monitor,1.0,149.99,2019-01-05 20:33:00,"738 10th St, Los Angeles, CA 90001",738 10th St,Los Angeles,CA,90001,2019,1,5,Saturday,20,149.99
4,141238,Wired Headphones,1.0,11.99,2019-01-25 11:59:00,"387 10th St, Austin, TX 73301",387 10th St,Austin,TX,73301,2019,1,25,Friday,11,11.99


In [13]:
# Desired column layout: order and price fields first, then the order date
# with its derived parts, then the address with its derived parts, and
# finally the year.
new_ordered_col = [
    'order_id', 'product', 'quantity_ordered', 'price_each', 'total_price',
    'order_date', 'month', 'day', 'weekday', 'hour',
    'purchase_address', 'street', 'city', 'state', 'zip_code', 'year'
    ]

# Selecting with the list reorders the columns; a name missing from the frame
# raises KeyError, and any column not listed here is dropped.
df = df[new_ordered_col]
df.head()

Unnamed: 0,order_id,product,quantity_ordered,price_each,total_price,order_date,month,day,weekday,hour,purchase_address,street,city,state,zip_code,year
0,141234,iPhone,1.0,700.0,700.0,2019-01-22 21:25:00,1,22,Tuesday,21,"944 Walnut St, Boston, MA 02215",944 Walnut St,Boston,MA,2215,2019
1,141235,Lightning Charging Cable,1.0,14.95,14.95,2019-01-28 14:15:00,1,28,Monday,14,"185 Maple St, Portland, OR 97035",185 Maple St,Portland,OR,97035,2019
2,141236,Wired Headphones,2.0,11.99,23.98,2019-01-17 13:33:00,1,17,Thursday,13,"538 Adams St, San Francisco, CA 94016",538 Adams St,San Francisco,CA,94016,2019
3,141237,27in FHD Monitor,1.0,149.99,149.99,2019-01-05 20:33:00,1,5,Saturday,20,"738 10th St, Los Angeles, CA 90001",738 10th St,Los Angeles,CA,90001,2019
4,141238,Wired Headphones,1.0,11.99,11.99,2019-01-25 11:59:00,1,25,Friday,11,"387 10th St, Austin, TX 73301",387 10th St,Austin,TX,73301,2019


In [14]:
# Order the rows chronologically by order timestamp; the original index
# labels are kept.
df = df.sort_values("order_date")
# Uncomment to write the cleaned frame to disk:
# df.to_csv("cleaned_data.csv", index=False)

In [15]:
# Preview the earliest orders now that the frame is sorted by order_date.
df.head()

Unnamed: 0,order_id,product,quantity_ordered,price_each,total_price,order_date,month,day,weekday,hour,purchase_address,street,city,state,zip_code,year
6344,147268,Wired Headphones,1.0,11.99,11.99,2019-01-01 03:07:00,1,1,Tuesday,3,"9 Lake St, New York City, NY 10001",9 Lake St,New York City,NY,10001,2019
7154,148041,USB-C Charging Cable,1.0,11.95,11.95,2019-01-01 03:40:00,1,1,Tuesday,3,"760 Church St, San Francisco, CA 94016",760 Church St,San Francisco,CA,94016,2019
8507,149343,Apple Airpods Headphones,1.0,150.0,150.0,2019-01-01 04:56:00,1,1,Tuesday,4,"735 5th St, New York City, NY 10001",735 5th St,New York City,NY,10001,2019
9161,149964,AAA Batteries (4-pack),1.0,2.99,2.99,2019-01-01 05:53:00,1,1,Tuesday,5,"75 Jackson St, Dallas, TX 75001",75 Jackson St,Dallas,TX,75001,2019
8514,149350,USB-C Charging Cable,2.0,11.95,23.9,2019-01-01 06:03:00,1,1,Tuesday,6,"943 2nd St, Atlanta, GA 30301",943 2nd St,Atlanta,GA,30301,2019
