#### IDEAS:

- Allow for user input:
    - enter what categories you want to see
    - enter to separate the data into male and female
    - allow for the calculations for said categories

### Imports

In [1]:
import pandas as pd
import matplotlib.pyplot as plt

In [2]:
df = pd.read_csv("D:sample_dataset.csv")

### Data Analysis

In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50000 entries, 0 to 49999
Data columns (total 9 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   Customer ID         50000 non-null  int64  
 1   Name                50000 non-null  object 
 2   Surname             50000 non-null  object 
 3   Gender              45022 non-null  object 
 4   Birthdate           50000 non-null  object 
 5   Transaction Amount  50000 non-null  float64
 6   Date                50000 non-null  object 
 7   Merchant Name       50000 non-null  object 
 8   Category            50000 non-null  object 
dtypes: float64(1), int64(1), object(7)
memory usage: 3.4+ MB


In [4]:
df.describe()


Unnamed: 0,Customer ID,Transaction Amount
count,50000.0,50000.0
mean,500136.79696,442.119239
std,288232.43164,631.669724
min,29.0,5.01
25%,251191.5,79.0075
50%,499520.5,182.195
75%,749854.25,470.515
max,999997.0,2999.88


#### Dropping N/A from data set

In [5]:
df2 = pd.read_csv("D:sample_dataset.csv")

In [6]:
df2.dropna().info()

<class 'pandas.core.frame.DataFrame'>
Index: 45022 entries, 0 to 49999
Data columns (total 9 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   Customer ID         45022 non-null  int64  
 1   Name                45022 non-null  object 
 2   Surname             45022 non-null  object 
 3   Gender              45022 non-null  object 
 4   Birthdate           45022 non-null  object 
 5   Transaction Amount  45022 non-null  float64
 6   Date                45022 non-null  object 
 7   Merchant Name       45022 non-null  object 
 8   Category            45022 non-null  object 
dtypes: float64(1), int64(1), object(7)
memory usage: 3.4+ MB


In [7]:
df2.dropna().describe()

Unnamed: 0,Customer ID,Transaction Amount
count,45022.0,45022.0
mean,500299.673582,444.810904
std,288268.017399,637.440223
min,29.0,5.01
25%,251230.25,78.92
50%,499528.5,181.985
75%,749996.75,471.6475
max,999997.0,2999.88


### Data Separation

In [8]:
# Categories For Shopping
for i in df2['Category'].drop_duplicates():
    print(i)

Cosmetic
Travel
Clothing
Electronics
Restaurant
Market


In [9]:
wmt = df2[df2['Gender'] == 'M']
wmt # Whole Male Transactions

Unnamed: 0,Customer ID,Name,Surname,Gender,Birthdate,Transaction Amount,Date,Merchant Name,Category
0,752858,Sean,Rodriguez,M,10/20/2002,2999.88,4/3/2023,Smith-Russell,Cosmetic
2,305449,Jacob,Williams,M,10/25/1981,2999.22,9/20/2023,Steele Inc,Clothing
3,988259,Nathan,Snyder,M,10/26/1977,2998.51,1/11/2023,"Wilson, Wilson and Russell",Cosmetic
6,124681,Thomas,Shaw,M,10/26/1976,2997.11,8/30/2023,"Evans, Griffin and Torres",Cosmetic
8,504238,Denise,Porter,M,10/31/1957,2996.85,1/12/2023,Jackson-Morgan,Restaurant
...,...,...,...,...,...,...,...,...,...
49979,366368,Christy,Wagner,M,10/23/1988,5.92,2/13/2023,"Black, Allen and Taylor",Clothing
49982,728409,Debbie,Griffin,M,10/22/1994,5.70,8/5/2023,James and Sons,Travel
49983,91701,Joseph,Lawson,M,10/24/1985,5.65,4/21/2023,Davis Ltd,Travel
49997,133285,Nicole,Franklin,M,10/26/1979,5.12,2/12/2023,"Cantrell, Haynes and Ballard",Market


In [10]:
wft = df2[(df2['Gender'] == 'F')]
wft # Whole Female Transactions

Unnamed: 0,Customer ID,Name,Surname,Gender,Birthdate,Transaction Amount,Date,Merchant Name,Category
1,26381,Michelle,Phelps,F,10/24/1985,2999.68,7/17/2023,"Peck, Spence and Young",Travel
4,764762,Crystal,Knapp,F,11/2/1951,2998.48,6/13/2023,Palmer-Hinton,Electronics
5,576539,Monica,Bartlett,F,10/20/2001,2997.81,8/24/2023,"Tran, Torres and Joyce",Cosmetic
7,521807,Kelsey,Pena,F,10/28/1968,2996.86,2/8/2023,Miller PLC,Cosmetic
9,583532,Alexander,Richardson,F,10/27/1974,2996.45,6/7/2023,"Blake, Mays and Anderson",Clothing
...,...,...,...,...,...,...,...,...,...
49993,754013,Katherine,Avery,F,10/30/1961,5.23,8/16/2023,"Wallace, Kim and Rosales",Restaurant
49994,197066,George,Cooper,F,10/21/1998,5.22,5/29/2023,Kim and Sons,Market
49995,891845,Christine,Leach,F,10/21/1997,5.22,8/30/2023,Alexander Ltd,Market
49996,800560,Anna,Allen,F,10/21/1999,5.16,5/3/2023,Knapp-Calhoun,Cosmetic


### Category Counts

In [11]:
# Male Category Counts

print(wmt['Category'].value_counts().to_frame())
print(f"Total:       {wmt['Category'].value_counts().sum():,}")

             count
Category          
Market        3761
Electronics   3747
Clothing      3744
Travel        3705
Restaurant    3694
Cosmetic      3625
Total:       22,276


In [12]:
# Female Category Counts

print(wft['Category'].value_counts().to_frame())
print(f"Total:       {wft['Category'].value_counts().sum():,}")

             count
Category          
Travel        3871
Restaurant    3835
Cosmetic      3825
Electronics   3801
Market        3745
Clothing      3669
Total:       22,746


### Data Generation
(Turn into user input)

In [13]:
# Male Travel Percentage of Total Category

# Clothing
# Cosmetic
# Electronics
# Market
# Restaurant
# Travel

wmt_cats = len(wmt['Category'])

clothing_wmt = (wmt['Category'] == "Clothing").sum()
cosmetic_wmt = (wmt['Category'] == "Cosmetic").sum()
electronics_wmt = (wmt['Category'] == "Electronics").sum()
market_wmt = (wmt['Category'] == "Market").sum()
restaurant_wmt = (wmt['Category'] == "Restaurant").sum()
travel_wmt = (wmt['Category'] == "Travel").sum()

male_clothing_percent = (clothing_wmt / wmt_cats) * 100
male_cosmetic_percent = (cosmetic_wmt / wmt_cats) * 100
male_electronics_percent = (electronics_wmt / wmt_cats) * 100
male_market_percent = (market_wmt / wmt_cats) * 100
male_restaurant_percent = (restaurant_wmt / wmt_cats) * 100
male_travel_percent = (travel_wmt / wmt_cats) * 100

total_percent = (male_clothing_percent + male_cosmetic_percent + male_electronics_percent + male_market_percent + male_restaurant_percent + male_travel_percent)

print("|============|Stats|============|")

print("-" * 17)
print("Counts: ")
print("-" * 17)

print(f"Market: {market_wmt:,}")
print(f"Electronics: {electronics_wmt:,}")
print(f"Clothing: {clothing_wmt:,}")
print(f"Traveling: {travel_wmt:,}")
print(f"Restaurant: {restaurant_wmt:,}")
print(f"Cosmetic: {cosmetic_wmt:,}")

print("=" * 33)

print(f"Sum of Category: {wmt_cats:,}")

print("-" * 17)
print("Percentages: ")
print("-" * 17)

print(f"Market: {round(male_market_percent, 2)}%")
print(f"Electronics: {round(male_electronics_percent, 2)}%")
print(f"Clothing: {round(male_clothing_percent, 2)}%")
print(f"Travel: {round(male_travel_percent, 2)}%")
print(f"Restaurant: {round(male_restaurant_percent, 2)}%")
print(f"Cosmetic: {round(male_cosmetic_percent, 2)}%")

print("=" * 33)

print(f"Total Percentage: {round(total_percent, 2)}%")

-----------------
Counts: 
-----------------
Market: 3,761
Electronics: 3,747
Clothing: 3,744
Traveling: 3,705
Restaurant: 3,694
Cosmetic: 3,625
Sum of Category: 22,276
-----------------
Percentages: 
-----------------
Market: 16.88%
Electronics: 16.82%
Clothing: 16.81%
Travel: 16.63%
Restaurant: 16.58%
Cosmetic: 16.27%
Total Percentage: 100.0%


In [14]:
# Female Travel Percentage of Total Category
# Clothing
# Cosmetic
# Electronics
# Market
# Restaurant
# Travel

wft_cats = len(wft['Category'])

clothing_wft = (wft['Category'] == "Clothing").sum()
cosmetic_wft = (wft['Category'] == "Cosmetic").sum()
electronics_wft = (wft['Category'] == "Electronics").sum()
market_wft = (wft['Category'] == "Market").sum()
restaurant_wft = (wft['Category'] == "Restaurant").sum()
travel_wft = (wft['Category'] == "Travel").sum()

female_clothing_percent = (clothing_wft / wft_cats) * 100
female_cosmetic_percent = (cosmetic_wft / wft_cats) * 100
female_electronics_percent = (electronics_wft / wft_cats) * 100
female_market_percent = (market_wft / wft_cats) * 100
female_restaurant_percent = (restaurant_wft / wft_cats) * 100
female_travel_percent = (travel_wft / wft_cats) * 100

total_percent = (female_clothing_percent + female_cosmetic_percent + female_electronics_percent + female_market_percent + female_restaurant_percent + female_travel_percent)

print("|============|Stats|============|")

print("-" * 17)
print("Counts: ")
print("-" * 17)

print(f"Traveling: {travel_wft:,}")
print(f"Restaurant: {restaurant_wft:,}")
print(f"Cosmetic: {cosmetic_wft:,}")
print(f"Electronics: {electronics_wft:,}")
print(f"Market: {market_wft:,}")
print(f"Clothing: {clothing_wft:,}")

print("=" * 33)

print(f"Sum of Category: {wft_cats:,}")

print("-" * 17)
print("Percentages: ")
print("-" * 17)

print(f"Travel: {round(female_travel_percent, 2)}%")
print(f"Restaurant: {round(female_restaurant_percent, 2)}%")
print(f"Cosmetic: {round(female_cosmetic_percent, 2)}%")
print(f"Electronics: {round(female_electronics_percent, 2)}%")
print(f"Market: {round(female_market_percent, 2)}%")
print(f"Clothing: {round(female_clothing_percent, 2)}%")

print("=" * 33)

print(f"Total Percentage: {round(total_percent, 2)}%")

-----------------
Counts: 
-----------------
Traveling: 3,871
Restaurant: 3,835
Cosmetic: 3,825
Electronics: 3,801
Market: 3,745
Clothing: 3,669
Sum of Category: 22,746
-----------------
Percentages: 
-----------------
Travel: 17.02%
Restaurant: 16.86%
Cosmetic: 16.82%
Electronics: 16.71%
Market: 16.46%
Clothing: 16.13%
Total Percentage: 100.0%


In [15]:
print(f"Total in both categories: {(wmt_cats + wft_cats):,}")

Total in both categories: 45,022


##### Difference Between Categories

In [16]:
# Differences Between Male & Female Categories
# Cosmetic
# Travel
# Clothing
# Electronics
# Restaurant
# Market

print("Diff Amoung Each Male & Female Categories")

# Example data for "whole male transactions" and "whole female transactions"
wmt1 = [wmt_cats, cosmetic_wmt, travel_wmt, clothing_wmt, electronics_wmt, restaurant_wmt, market_wmt]
wft2 = [wft_cats, cosmetic_wft, travel_wft, clothing_wft, electronics_wft, restaurant_wft, market_wft]

# Calculate MAE
n = wmt_cats + wft_cats
mae = sum(abs(wmt_value - wft_value) for wmt_value, wft_value in zip(wmt1, wft2)) / n * 100

print(f"Data Diff: {abs(wmt_cats - wft_cats)}, As a Percentage: {round(abs((wmt_cats - wft_cats) / (wmt_cats + wft_cats) * 100), 2)}%")

print(f"Diff btwn cosmetic: {abs(cosmetic_wmt - cosmetic_wft)}, As Percentage: {round(abs((cosmetic_wmt - cosmetic_wft) / (cosmetic_wmt + cosmetic_wft) * 100), 2)}%")
print(f"Diff btwn travel: {abs(travel_wmt - travel_wft)}, As Percentage: {round(abs((travel_wmt - travel_wft) / (travel_wmt + travel_wft) * 100), 2)}%")
print(f"Diff btwn clothing: {abs(clothing_wmt - clothing_wft)}, As Percentage: {round(abs((clothing_wmt - clothing_wft) / (clothing_wmt + clothing_wft) * 100), 2)}%")
print(f"Diff btwn electronics: {abs(electronics_wmt - electronics_wft)}, As Percentage: {round(abs((electronics_wmt - electronics_wft) / (electronics_wmt + electronics_wft) * 100), 2)}%")
print(f"Diff btwn resturant: {abs(restaurant_wmt - restaurant_wft)}, As Percentage: {round(abs((restaurant_wmt - restaurant_wft) / (restaurant_wmt + restaurant_wft) * 100), 2)}%")
print(f"Diff btwn market: {abs(market_wmt - market_wft)}, As Percentage: {round(abs((market_wmt - market_wft) / (market_wmt + market_wft) * 100), 2)}%")

print(f"Mean Absolute Error: {round(mae, 2)}%")


Diff Amoung Each Male & Female Categories
Data Diff: 470, As a Percentage: 1.04%
Diff btwn cosmetic: 200, As Percentage: 2.68%
Diff btwn travel: 166, As Percentage: 2.19%
Diff btwn clothing: 75, As Percentage: 1.01%
Diff btwn electronics: 54, As Percentage: 0.72%
Diff btwn resturant: 141, As Percentage: 1.87%
Diff btwn market: 16, As Percentage: 0.21%
Mean Absolute Error: 2.49%


### Transaction Sizes

In [17]:
# Find the row with the largest transaction amount (Men's)
largest_transaction_row = wmt[wmt['Transaction Amount'] == wmt['Transaction Amount'].max()]

# Access the 'Name' and 'Surname' columns in that row
first_name = largest_transaction_row['Name'].values[0]
last_name = largest_transaction_row['Surname'].values[0]
the_category = largest_transaction_row['Category'].values[0]

# n = int(input("How many people would you like to see?: "))

# Iterate through the top 10 rows and print information for each row
for index, row in wmt.head(10).iterrows():
    transaction_amount = row['Transaction Amount']
    name = row['Name']
    surname = row['Surname']
    category = row['Category']
    print(f"{name} {surname}, Amount: ${transaction_amount}, Category: {category}")


Sean Rodriguez, Amount: $2999.88, Category: Cosmetic
Jacob Williams, Amount: $2999.22, Category: Clothing
Nathan Snyder, Amount: $2998.51, Category: Cosmetic
Thomas Shaw, Amount: $2997.11, Category: Cosmetic
Denise Porter, Amount: $2996.85, Category: Restaurant
Kimberly Ball, Amount: $2996.42, Category: Travel
Hunter Peters, Amount: $2996.2, Category: Market
Julie Russell, Amount: $2995.79, Category: Cosmetic
Christopher Harris, Amount: $2995.32, Category: Cosmetic
Amber Brown, Amount: $2994.94, Category: Cosmetic


In [18]:
# Find the row with the largest transaction amount (Women's)
largest_transaction_row = wft[wft['Transaction Amount'] == wft['Transaction Amount'].max()]

# Access the 'Name' and 'Surname' columns in that row
first_name = largest_transaction_row['Name'].values[0]
last_name = largest_transaction_row['Surname'].values[0]
the_category = largest_transaction_row['Category'].values[0]

# n = int(input("How many people would you like to see?: "))

# Iterate through the top 10 rows and print information for each row
for index, row in wmt.head(10).iterrows():
    transaction_amount = row['Transaction Amount']
    name = row['Name']
    surname = row['Surname']
    category = row['Category']
    print(f"{name} {surname}, Amount: ${transaction_amount}, Category: {category}")


Sean Rodriguez, Amount: $2999.88, Category: Cosmetic
Jacob Williams, Amount: $2999.22, Category: Clothing
Nathan Snyder, Amount: $2998.51, Category: Cosmetic
Thomas Shaw, Amount: $2997.11, Category: Cosmetic
Denise Porter, Amount: $2996.85, Category: Restaurant
Kimberly Ball, Amount: $2996.42, Category: Travel
Hunter Peters, Amount: $2996.2, Category: Market
Julie Russell, Amount: $2995.79, Category: Cosmetic
Christopher Harris, Amount: $2995.32, Category: Cosmetic
Amber Brown, Amount: $2994.94, Category: Cosmetic


### Graphs