In [2]:
import os
import pandas as pd
import matplotlib.pyplot as plt
from datetime import datetime

def data_import(folder_path: str) -> pd.DataFrame:
    csv_files = [f for f in os.listdir(folder_path) if f.endswith('.csv')]
    final_df = None

    if len(csv_files) == 0:
        print("Files not found.")
    else:

        df_list = []
        for arquivo in csv_files:
            file_path = os.path.join(folder_path, arquivo)
            df = pd.read_csv(file_path, sep=',')
            df_cleaned = df.dropna()
            df_cleaned = df_cleaned[df_cleaned['Order Date'] != 'Order Date']
            df_list.append(df_cleaned)

        final_df = pd.concat(df_list, ignore_index=False).drop(columns=['Unnamed: 0'])

    final_df.to_csv('data/output/sales.csv')
    return final_df

path = 'data/sales'
df = data_import(path)


In [5]:
df['Date'] = pd.to_datetime(df['Order Date'], format='%m/%d/%y %H:%M')
df['Order ID'] = pd.to_numeric(df['Order ID'])
df['Quantity Ordered'] = pd.to_numeric(df['Quantity Ordered'])
df['Price Each'] = pd.to_numeric(df['Price Each'])
df.set_index('Date', inplace=True)

df

Unnamed: 0_level_0,Order ID,Product,Quantity Ordered,Price Each,Order Date,Purchase Address
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2019-04-19 08:46:00,176558,USB-C Charging Cable,2,11.95,04/19/19 08:46,"917 1st St, Dallas, TX 75001"
2019-04-07 22:30:00,176559,Bose SoundSport Headphones,1,99.99,04/07/19 22:30,"682 Chestnut St, Boston, MA 02215"
2019-04-12 14:38:00,176560,Google Phone,1,600.00,04/12/19 14:38,"669 Spruce St, Los Angeles, CA 90001"
2019-04-12 14:38:00,176560,Wired Headphones,1,11.99,04/12/19 14:38,"669 Spruce St, Los Angeles, CA 90001"
2019-04-30 09:27:00,176561,Wired Headphones,1,11.99,04/30/19 09:27,"333 8th St, Los Angeles, CA 90001"
...,...,...,...,...,...,...
2019-09-17 20:56:00,259353,AAA Batteries (4-pack),3,2.99,09/17/19 20:56,"840 Highland St, Los Angeles, CA 90001"
2019-09-01 16:00:00,259354,iPhone,1,700.00,09/01/19 16:00,"216 Dogwood St, San Francisco, CA 94016"
2019-09-23 07:39:00,259355,iPhone,1,700.00,09/23/19 07:39,"220 12th St, San Francisco, CA 94016"
2019-09-19 17:30:00,259356,34in Ultrawide Monitor,1,379.99,09/19/19 17:30,"511 Forest St, San Francisco, CA 94016"


In [14]:
list_of_products = df['Product'].unique()
print(list_of_products)

usb_c_charging_cable = df[df['Product'] == 'USB-C Charging Cable']
bose_soundsport_headphone = df[df['Product'] == 'Bose SoundSport Headphones']
google_phone = df[df['Product'] == 'Google Phone']
wired_headphones = df[df['Product'] == 'Wired Headphones']
macbook = df[df['Product'] == 'Macbook Pro Laptop']
lighting_charge_cable = df[df['Product'] == 'Lightning Charging Cable']
gaming_monitor = df[df['Product'] == '27in 4K Gaming Monitor']
aa_batteries = df[df['Product'] == 'AA Batteries (4-pack)']
apple_airpods = df[df['Product'] == 'Apple Airpods Headphones']
aaa_batteries = df[df['Product'] == 'AAA Batteries (4-pack)']
iphone = df[df['Product'] == 'iPhone']
flatscreen_tv = df[df['Product'] == 'Flatscreen TV']
twentysevenin_fhd_monitor = df[df['Product'] == '27in FHD Monitor']
twentyin_monitor = df[df['Product'] == '20in Monitor']
lg_dryer = df[df['Product'] == 'LG Dryer']
thinkpad_laptop = df[df['Product'] == 'ThinkPad Laptop']
vareebadd_phone = df[df['Product'] == 'Vareebadd Phone']
lg_washing_machine = df[df['Product'] == 'LG Washing Machine']
thirtyfourin_ultrawide_monitor = df[df['Product'] == '34in Ultrawide Monitor']

product_df_list = [
    usb_c_charging_cable, bose_soundsport_headphone, google_phone, wired_headphones, macbook, lighting_charge_cable, gaming_monitor,
    aa_batteries, apple_airpods, aaa_batteries, iphone, flatscreen_tv, twentysevenin_fhd_monitor, twentyin_monitor, lg_dryer,
    thinkpad_laptop, vareebadd_phone, lg_washing_machine, thirtyfourin_ultrawide_monitor
]

['USB-C Charging Cable' 'Bose SoundSport Headphones' 'Google Phone'
 'Wired Headphones' 'Macbook Pro Laptop' 'Lightning Charging Cable'
 '27in 4K Gaming Monitor' 'AA Batteries (4-pack)'
 'Apple Airpods Headphones' 'AAA Batteries (4-pack)' 'iPhone'
 'Flatscreen TV' '27in FHD Monitor' '20in Monitor' 'LG Dryer'
 'ThinkPad Laptop' 'Vareebadd Phone' 'LG Washing Machine'
 '34in Ultrawide Monitor']


In [24]:
# total price per day
for product_df in product_df_list:
    product_df['Total Sold'] = product_df['Quantity Ordered'] * product_df['Price Each']

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  product_df['Total Sold'] = product_df['Quantity Ordered'] * product_df['Price Each']


Unnamed: 0_level_0,Order ID,Product,Quantity Ordered,Price Each,Order Date,Purchase Address,Total Sold
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2019-04-12 14:38:00,176560,Wired Headphones,1,11.99,04/12/19 14:38,"669 Spruce St, Los Angeles, CA 90001",11.99
2019-04-30 09:27:00,176561,Wired Headphones,1,11.99,04/30/19 09:27,"333 8th St, Los Angeles, CA 90001",11.99
2019-04-08 14:05:00,176566,Wired Headphones,1,11.99,04/08/19 14:05,"83 7th St, Boston, MA 02215",11.99
2019-04-17 23:04:00,176594,Wired Headphones,1,11.99,04/17/19 23:04,"63 Maple St, San Francisco, CA 94016",11.99
2019-04-02 09:11:00,176595,Wired Headphones,3,11.99,04/02/19 09:11,"383 6th St, Los Angeles, CA 90001",35.97
...,...,...,...,...,...,...,...
2019-09-17 22:07:00,259300,Wired Headphones,1,11.99,09/17/19 22:07,"932 13th St, San Francisco, CA 94016",11.99
2019-09-09 12:55:00,259309,Wired Headphones,1,11.99,09/09/19 12:55,"865 Ridge St, Atlanta, GA 30301",11.99
2019-09-02 19:51:00,259312,Wired Headphones,1,11.99,09/02/19 19:51,"573 Sunset St, San Francisco, CA 94016",11.99
2019-09-16 00:25:00,259314,Wired Headphones,1,11.99,09/16/19 00:25,"241 Highland St, Atlanta, GA 30301",11.99


In [27]:
# correlation
correlation_coefficient = lg_washing_machine['Total Sold'].corr(lg_dryer['Total Sold'], method='pearson')
print(correlation_coefficient)


nan


  c = cov(x, y, rowvar, dtype=dtype)
  c *= np.true_divide(1, fact)
  c *= np.true_divide(1, fact)
