In [None]:
import pandas as pd

def _fill_missing(frame, numeric_cols, categorical_cols):
    """Return a copy of ``frame`` with missing values imputed.

    Parameters
    ----------
    frame : pandas.DataFrame
        Table to impute; the caller's object is left untouched.
    numeric_cols : pandas.Index
        Columns whose missing values are replaced by the column mean.
    categorical_cols : pandas.Index
        Columns whose missing values are replaced by the column's most
        frequent value (first row of ``mode()``).

    Returns
    -------
    pandas.DataFrame
        Imputed copy of ``frame``.
    """
    filled = frame.copy()
    if len(numeric_cols) > 0:
        filled[numeric_cols] = filled[numeric_cols].fillna(filled[numeric_cols].mean())
    if len(categorical_cols) > 0:
        modes = filled[categorical_cols].mode()
        # mode() has zero rows when every categorical value is missing;
        # indexing .iloc[0] on it would raise IndexError, so skip in that case.
        if not modes.empty:
            filled[categorical_cols] = filled[categorical_cols].fillna(modes.iloc[0])
    return filled


# Load the two raw tables and preview them.
customer = pd.read_csv('customers.csv')
sales = pd.read_csv('sales.csv')
print(customer.head())
print(sales.head())


print(f'Customer : Rows: {customer.shape[0]}, Columns: {customer.shape[1]}')
print(f'Sales : Rows: {sales.shape[0]}, Columns: {sales.shape[1]}')

print(f'\nCustomer Null Values:\n {customer.isnull().sum()}')
print(f'\nSales Null Values:\n {sales.isnull().sum()}')

# Column groups that drive the imputation strategy (mean vs. most frequent value).
numerical_columns_customer = customer.select_dtypes(include=['float64', 'int64']).columns
numerical_columns_sales = sales.select_dtypes(include=['float64', 'int64']).columns

categorical_columns_customer = customer.select_dtypes(include=['object']).columns
categorical_columns_sales = sales.select_dtypes(include=['object']).columns

# NOTE(review): mean imputation is applied to every numeric column, which would
# include identifier-like columns if they are numeric (e.g. CustomerID, used as
# the merge key later in this notebook). A mean-filled key is not a real ID;
# confirm whether rows with a missing key should be dropped instead.
customer = _fill_missing(customer, numerical_columns_customer, categorical_columns_customer)
sales = _fill_missing(sales, numerical_columns_sales, categorical_columns_sales)

print(f'\nCustomer Null Values after filling:\n {customer.isnull().sum()}')
print(f'\nSales Null Values after filling:\n {sales.isnull().sum()}')


In [None]:
import time
# Compare filtering by city on a list of row dicts versus on the DataFrame.
customers_dict = customer.to_dict(orient="records")

city_to_filter = "Chicago"

# perf_counter() is the high-resolution monotonic clock meant for measuring
# short intervals; time.time() is a wall clock whose resolution can be too
# coarse for operations this fast and which can jump if the system clock changes.
start_dict = time.perf_counter()
# The loop variable is named `record` so it does not shadow the `customer` DataFrame.
customers_dict_2 = [record for record in customers_dict if record["City"] == city_to_filter]
end_dict = time.perf_counter()

start_df = time.perf_counter()
customers_df = customer[customer["City"] == city_to_filter]
end_df = time.perf_counter()

print("Using Dictionary:")
print(customers_dict_2)
print("Using DataFrame:")
print(customers_df)

# Elapsed seconds for each approach (the to_dict conversion itself is not timed).
print("Dictionary time:", end_dict - start_dict)
print("DataFrame time:", end_df - start_df)


In [None]:
# Show rows that exactly repeat an earlier row (the first occurrence is not flagged).
customer_dup_mask = customer.duplicated()
print("Duplicate Rows in Customer:", customer[customer_dup_mask], sep="\n")

sales_dup_mask = sales.duplicated()
print("\nDuplicate Rows in Sales:", sales[sales_dup_mask], sep="\n")

# Build de-duplicated copies; the source frames are left unchanged.
customer_clean = customer.drop_duplicates()
sales_clean = sales.drop_duplicates()

# Sanity check: both flags should print False after de-duplication.
customer_has_dupes = customer_clean.duplicated().any()
sales_has_dupes = sales_clean.duplicated().any()
print("\nDuplicates left in Customer", customer_has_dupes)
print("Duplicates left in Sales", sales_has_dupes)

# Persist the cleaned tables without the index column.
for cleaned_frame, out_path in (
    (customer_clean, 'cleaned_customers.csv'),
    (sales_clean, 'cleaned_sales.csv'),
):
    cleaned_frame.to_csv(out_path, index=False)


In [None]:
# Fraction of the original amount that remains after the 10% discount.
DISCOUNT_MULTIPLIER = 0.9

# assign() builds a new frame instead of writing a column into the object
# returned by drop_duplicates(); in-place column assignment on such a derived
# frame can trigger pandas' SettingWithCopyWarning. The name is rebound so
# sales_clean still carries the Discounted_Amount column afterwards.
sales_clean = sales_clean.assign(
    Discounted_Amount=sales_clean['Amount'] * DISCOUNT_MULTIPLIER
)

# Total discounted amount per product; the aggregate column is renamed by name
# rather than by overwriting the whole column list positionally.
grouped_sales = (
    sales_clean.groupby('Product')['Discounted_Amount']
    .sum()
    .reset_index()
    .rename(columns={'Discounted_Amount': 'Total_Sales'})
)

print("\nTotal Sales by Product (after 10% discount):")
print(grouped_sales)

grouped_sales.to_csv('grouped_sales.csv', index=False)


In [None]:
# Filter on the de-duplicated customer table: filtering the frame that still
# contains duplicate rows would repeat customers in the export and inflate the
# per-city counts below.
age_in_range = customer_clean['Age'].between(25, 35)  # inclusive on both ends
filtered_customers = customer_clean[age_in_range]
filtered_customers.to_csv('filtered_customer.csv', index=False)

# Number of customers in the 25-35 age band per city, most frequent first.
city_counts = filtered_customers['City'].value_counts()

print("\nCustomers Aged 25 to 35:")
print(filtered_customers)

print("\nNumber of Customers by City:")
print(city_counts)


In [None]:
# Join the de-duplicated tables. Merging the frames that still contain
# duplicate rows would count repeated sales (and repeated customers) more than
# once and inflate every total computed below.
# An earlier cell adds a derived Discounted_Amount column to sales_clean; it is
# dropped here (if present) so merged_data keeps only the raw sales columns.
sales_for_merge = sales_clean.drop(columns=['Discounted_Amount'], errors='ignore')
merged_data = pd.merge(customer_clean, sales_for_merge, on='CustomerID', how='inner')
print("Merged Dataset:")
print(merged_data.head())

# Total sales amount per city, largest first; the top row is the best city.
city_sales = merged_data.groupby('City')['Amount'].sum().reset_index()
city_sales = city_sales.sort_values(by='Amount', ascending=False)
highest_sales_city = city_sales.iloc[0]

print("\nCity with the Highest Total Sales:")
print(highest_sales_city)

# NOTE(review): this counts sales rows per product, not units. If the sales
# table has a quantity column, summing it would match the "units sold" label
# better; confirm against the data schema.
product_sales = merged_data.groupby('Product')['Amount'].count().reset_index()
product_sales = product_sales.sort_values(by='Amount', ascending=False)
most_sold_product = product_sales.iloc[0]

print("\nProduct with the Most Units Sold:")
print(most_sold_product)


In [None]:
# Distinct values, in order of first appearance, for the two categorical columns.
unique_cities = merged_data['City'].unique()
unique_products = merged_data['Product'].unique()

print("Unique Values in 'City' Column:", unique_cities, sep="\n")
print("\nUnique Values in 'Product' Column:", unique_products, sep="\n")

# Central tendency of the sale amounts across all merged rows.
amount_column = merged_data['Amount']
mean_amount = amount_column.mean()
median_amount = amount_column.median()

print("\nMean of 'Amount' Column:", mean_amount, sep="\n")
print("\nMedian of 'Amount' Column:", median_amount, sep="\n")
