In [None]:
import pandas as pd

In [None]:
# Section banner for Task 1: a title framed by two 60-character rules.
banner_rule = "=" * 60
print(banner_rule)
print("TASK 1: Reading and Displaying Data")
print(banner_rule)

TASK 1: Reading and Displaying Data


In [None]:
# Load the online-store customer CSV into a DataFrame.
# The path points at the notebook's /content directory; adjust it if the file lives elsewhere.
customer_csv_path = "/content/online_store_customer_data.csv"
fulldata_df = pd.read_csv(customer_csv_path)

In [None]:
# Preview the top of the dataset: head(10) returns the first 10 rows.
first_ten_rows = fulldata_df.head(10)
print("\nFirst 10 rows using head(10):\n")
print(first_ten_rows)


First 10 rows using head(10):

  Transaction_date  Transaction_ID  Gender   Age Marital_status  State_names  \
0         1/1/2019          151200  Female  19.0         Single       Kansas   
1         1/1/2019          151201    Male  49.0         Single     Illinois   
2         1/1/2019          151202    Male  63.0        Married   New Mexico   
3         1/1/2019          151203     NaN  18.0         Single     Virginia   
4         1/1/2019          151204    Male  27.0         Single  Connecticut   
5         1/3/2019          151205    Male  71.0         Single       Hawaii   
6         1/3/2019          151206  Female  34.0        Married   New Mexico   
7         1/3/2019          151207    Male  37.0        Married  Connecticut   
8         1/4/2019          151208    Male  75.0        Married      Florida   
9         1/4/2019          151209  Female  41.0        Married      Vermont   

    Segment Employees_status Payment_method  Referal  Amount_spent  
0     Basic     Un

In [None]:
# Preview the bottom of the dataset: tail(10) returns the last 10 rows.
last_ten_rows = fulldata_df.tail(10)
print("\nLast 10 rows using tail(10):\n")
print(last_ten_rows)


Last 10 rows using tail(10):

     Transaction_date  Transaction_ID  Gender   Age Marital_status  \
2502        4/29/2021          153690    Male  16.0         Single   
2503        4/29/2021          153691    Male  67.0         Single   
2504        4/30/2021          153692    Male  34.0        Married   
2505        4/30/2021          153693    Male  54.0        Married   
2506        4/30/2021          153694    Male  34.0         Single   
2507         5/1/2021          153695  Female  57.0         Single   
2508         5/1/2021          153696  Female  36.0        Married   
2509         5/1/2021          153697    Male  22.0         Single   
2510         5/1/2021          153698     NaN  44.0         Single   
2511         5/1/2021          153699    Male  48.0         Single   

         State_names   Segment Employees_status Payment_method  Referal  \
2502       Louisiana     Basic    self-employed         PayPal      0.0   
2503        Colorado     Basic        Employees 

In [None]:
# --------------------------------------------
# DataFrame metadata: size, column names, dtypes
# --------------------------------------------
# Inspect the structure of the data before filtering or modelling it.

print("\nDataFrame Info:")

# .shape is a (rows, columns) tuple; unpack it once and report both parts.
row_count, column_count = fulldata_df.shape

# Number of records in the dataset.
print(f"Total rows: {row_count}")

# Number of attributes stored for each record (11 for this dataset).
print(f"Total columns: {column_count}")

# .columns holds the column labels; list() turns the Index into a plain
# Python list so it prints on one readable line.
# Expected: ['Transaction_date', 'Transaction_ID', 'Gender', 'Age', 'Marital_status',
#            'State_names', 'Segment', 'Employees_status', 'Payment_method', 'Referal', 'Amount_spent']
print(f"\nColumn names: {list(fulldata_df.columns)}")

# .dtypes reports the storage type of every column, which tells you
# which columns are numeric and which hold text.
print(f"\nData types:\n{fulldata_df.dtypes}")

# Expected dtypes for this dataset:
#   Transaction_date    object   (text)
#   Transaction_ID      int64    (whole numbers)
#   Gender              object   (text, categorical)
#   Age                 float64  (decimal numbers)
#   Marital_status      object   (text, categorical)
#   State_names         object   (text, categorical)
#   Segment             object   (text, categorical)
#   Employees_status    object   (text, categorical)
#   Payment_method      object   (text, categorical)
#   Referal             float64  (decimal numbers)
#   Amount_spent        float64  (decimal numbers)

# Why this matters for preprocessing:
# - int64 / float64 columns are already numeric.
# - object columns hold text and generally have to be encoded as numbers
#   before they can be fed to an ML algorithm.



DataFrame Info:
Total rows: 2512
Total columns: 11

Column names: ['Transaction_date', 'Transaction_ID', 'Gender', 'Age', 'Marital_status', 'State_names', 'Segment', 'Employees_status', 'Payment_method', 'Referal', 'Amount_spent']

Data types:
Transaction_date     object
Transaction_ID        int64
Gender               object
Age                 float64
Marital_status       object
State_names          object
Segment              object
Employees_status     object
Payment_method       object
Referal             float64
Amount_spent        float64
dtype: object


In [None]:
print("\n" + "=" * 60)
print("TASK 2: Working with Columns")
print("=" * 60)

# Part A: Select column "Age" as a DataFrame (NOT a Series).
# Indexing with a list of labels (double brackets) returns a DataFrame;
# a single label in single brackets would return a Series instead.
age_df = fulldata_df[['Age']]
print("\nPart A - Age column as DataFrame:")
print(f"Type: {type(age_df)}")
print(f"Shape: {age_df.shape}")
print(age_df.head())

# Part B: Select several columns at once as a new DataFrame.
personal_df = fulldata_df[['Age', 'Gender', 'Marital_status']]
print("\nPart B - Personal information as DataFrame:")
print(f"Type: {type(personal_df)}")
print(f"Shape: {personal_df.shape}")
print(personal_df.head())

# Part C: Reorder columns AND rename them using rename() with inplace=True.
# Step 1: Reorder by selecting the columns in the desired order.
# .copy() makes the result an explicitly independent frame, so the in-place
# rename below cannot raise SettingWithCopyWarning or be mistaken for an
# attempt to modify the frame it was selected from.
personal_df = personal_df[['Gender', 'Marital_status', 'Age']].copy()

# Step 2: Rename the columns in place (the exercise asks for inplace=True).
personal_df.rename(
    columns={'Gender': 'gender', 'Marital_status': 'marital status', 'Age': 'age'},
    inplace=True,
)

print("\nPart C - Reordered and renamed columns:")
print(personal_df.head())

# Step 3: Select 4 columns ...
selected_cols = fulldata_df[['Employees_status', 'Payment_method', 'Referal', 'Amount_spent']]
print("\nStep 3 - Selected columns:")
print(selected_cols.head())

# Step 4: ... then remove 2 of them.
# drop(columns=...) returns a new frame instead of mutating a selection of
# fulldata_df in place; fulldata_df itself is left untouched.
selected_cols = selected_cols.drop(columns=['Employees_status', 'Payment_method'])
print("\nStep 4 - Deleted columns:")
print(selected_cols.head())


TASK 2: Working with Columns

Part A - Age column as DataFrame:
Type: <class 'pandas.core.frame.DataFrame'>
Shape: (2512, 1)
    Age
0  19.0
1  49.0
2  63.0
3  18.0
4  27.0

Part B - Personal information as DataFrame:
Type: <class 'pandas.core.frame.DataFrame'>
Shape: (2512, 3)
    Age  Gender Marital_status
0  19.0  Female         Single
1  49.0    Male         Single
2  63.0    Male        Married
3  18.0     NaN         Single
4  27.0    Male         Single

Part C - Reordered and renamed columns:
   gender marital status   age
0  Female         Single  19.0
1    Male         Single  49.0
2    Male        Married  63.0
3     NaN         Single  18.0
4    Male         Single  27.0

Step 3 - Selected columns:
  Employees_status Payment_method  Referal  Amount_spent
0     Unemployment          Other      1.0       2051.36
1    self-employed           Card      0.0        544.04
2          workers         PayPal      1.0       1572.60
3          workers           Card      1.0       11

In [None]:
print("\n" + "=" * 60)
print("TASK 3: Working with Rows (Filtering Data)")
print("=" * 60)

# Every filter below builds a boolean mask from one column and keeps the rows
# where the mask is True. Rows whose value is missing (NaN) compare False and
# are therefore left out of the result.
# NOTE: the printed labels keep the exercise's wording ("employees").

# Step 1: Rows whose marital status is exactly 'Single'.
single_employees_df = fulldata_df[fulldata_df['Marital_status'] == 'Single']
print("\nStep 1 - Single employees:")
print(single_employees_df)

# Step 2: Rows whose state is exactly 'Connecticut'.
connecticut_employees_df = fulldata_df[fulldata_df['State_names'] == 'Connecticut']
print("\nStep 2 - Connecticut employees:")
print(connecticut_employees_df)

# Step 3: Rows with Amount_spent strictly greater than 1000.
above_1000_spent_df = fulldata_df[fulldata_df['Amount_spent'] > 1000]
print("\nStep 3 - Customers who spent more than $1000:")
print(above_1000_spent_df)

# Step 4: Rows with Age strictly greater than 30.
older_than_30_df = fulldata_df[fulldata_df['Age'] > 30]
print("\nStep 4 - Customers older than 30:")
print(older_than_30_df)

# Filter 5: Combine a row filter with a column selection.
# A single .loc[rows, columns] call does both at once, instead of chaining
# two separate indexing operations (filter, then select).
mixed_filter_df = fulldata_df.loc[
    fulldata_df['Amount_spent'] > 1500,
    ['Age', 'Marital_status', 'Gender'],
]
print("\nFilter 5 - Age, Marital_status, Gender for employees spending > $1500:")
print(f"Total Records: {len(mixed_filter_df)}")
print(mixed_filter_df)


TASK 3: Working with Rows (Filtering Data)

Step 1 - Single employees:
     Transaction_date  Transaction_ID  Gender   Age Marital_status  \
0            1/1/2019          151200  Female  19.0         Single   
1            1/1/2019          151201    Male  49.0         Single   
3            1/1/2019          151203     NaN  18.0         Single   
4            1/1/2019          151204    Male  27.0         Single   
5            1/3/2019          151205    Male  71.0         Single   
...               ...             ...     ...   ...            ...   
2506        4/30/2021          153694    Male  34.0         Single   
2507         5/1/2021          153695  Female  57.0         Single   
2509         5/1/2021          153697    Male  22.0         Single   
2510         5/1/2021          153698     NaN  44.0         Single   
2511         5/1/2021          153699    Male  48.0         Single   

         State_names   Segment Employees_status Payment_method  Referal  \
0           

In [None]:
# Filter 6: Rank states by the TOTAL amount spent in each state.
# Sorting the raw rows would rank individual transactions (the same state can
# then appear several times), so the amounts are first summed per state.
print("\nFilter 6 - States ranked by total amount spent (Highest to Lowest):")

# One row per state with the sum of Amount_spent. as_index=False keeps
# 'State_names' as a regular column. By default groupby leaves out rows whose
# state is missing, and sum() skips missing amounts.
state_spending = fulldata_df.groupby('State_names', as_index=False)['Amount_spent'].sum()

# Highest total first.
state_spending_sorted = state_spending.sort_values('Amount_spent', ascending=False)

# Show the 15 top-spending states.
print(state_spending_sorted.head(15))


Filter 6 - States ranked by total amount spent (Highest to Lowest):
         State_names  Amount_spent
17        New Mexico       2999.98
485          Arizona       2998.62
2279         Arizona       2997.21
589   North Carolina       2997.15
743         Illinois       2996.82
2367    Pennsylvania       2995.73
101         Virginia       2989.33
1254          Nevada       2988.13
1123   West Virginia       2987.96
1177        Michigan       2985.70
2136        Nebraska       2984.37
2445      New Jersey       2982.88
1079        Maryland       2981.36
1105    Rhode Island       2979.76
501          Montana       2978.21


In [None]:
print("\n" + "=" * 60)
print("SLICING: loc vs iloc")
print("=" * 60)

# Key difference used throughout this cell:
# - loc slices by LABEL and INCLUDES the end label   (0:10 -> rows 0..10)
# - iloc slices by POSITION and EXCLUDES the end     (0:11 -> rows 0..10)

# Example 1: Select rows 0-10 and columns "Age" and "Gender" using loc
print("\nSlice 1 - loc[0:10, ['Age', 'Gender']] (rows 0-10, Age & Gender columns):")
slice1 = fulldata_df.loc[0:10, ['Age', 'Gender']]
print(slice1)

# Example 2: Same thing using iloc (integer positions).
# The stop position must be 11 to include row 10, because iloc's end is
# exclusive. Positions 3 and 2 are the 'Age' and 'Gender' columns.
print("\nSlice 2 - iloc[0:11, [3, 2]] (rows 0-10, columns at positions 3 & 2):")
slice2 = fulldata_df.iloc[0:11, [3, 2]]
print(slice2)

# Example 3: Rows 10-30 (inclusive), all columns using loc
print("\nSlice 3 - loc[10:30, :] (rows 10-30, all columns):")
slice3 = fulldata_df.loc[10:30, :]
print(slice3)

# Example 4: Rows 10-30 (inclusive), specific columns using loc
print("\nSlice 4 - loc[10:30, ['Age', 'Gender', 'Amount_spent']] (rows 10-30)")
slice4 = fulldata_df.loc[10:30, ['Age', 'Gender', 'Amount_spent']]
print(slice4)

# Example 5: All rows except the first 5, specific columns.
# loc picks the columns by label; the trailing iloc[5:] then skips the first
# five rows by position.
print("\nSlice 5 - loc[:, ['State_names', 'Segment', 'Employees_status', 'Payment_method']].iloc[5:] (all rows except first 5)")
slice5 = fulldata_df.loc[:, ['State_names', 'Segment', 'Employees_status', 'Payment_method']].iloc[5:]
print(slice5)


SLICING: loc vs iloc

Slice 1 - loc[0:10, ['Age', 'Gender']] (rows 0-10, Age & Gender columns):
     Age  Gender
0   19.0  Female
1   49.0    Male
2   63.0    Male
3   18.0     NaN
4   27.0    Male
5   71.0    Male
6   34.0  Female
7   37.0    Male
8   75.0    Male
9   41.0  Female
10  56.0  Female

Slice 2 - iloc[0:10, [3, 2]] (rows 0-10, columns at positions 3 & 2):
    Age  Gender
0  19.0  Female
1  49.0    Male
2  63.0    Male
3  18.0     NaN
4  27.0    Male
5  71.0    Male
6  34.0  Female
7  37.0    Male
8  75.0    Male
9  41.0  Female

Slice 3 - loc[10:30, :] (rows 10-30, all columns):
   Transaction_date  Transaction_ID  Gender   Age Marital_status  \
10         1/4/2019          151210  Female  56.0        Married   
11         1/5/2019          151211  Female  63.0        Married   
12         1/5/2019          151212  Female  60.0        Married   
13         1/5/2019          151213    Male  47.0         Single   
14         1/5/2019          151214    Male  24.0        Mar