<a href="https://colab.research.google.com/github/mercy09/ijeoma/blob/main/data_preparation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Install necessary libraries if not already available
!pip install pandas
!pip install numpy


In [None]:
# Importing the required libraries
import pandas as pd
import numpy as np

In [None]:
# Data Preparation Stage 1: Data Collection
# Simulated restaurant dataset, one entry per customer. The sample contains
# deliberately dirty values: a missing and a negative OrderValue, plus a None
# and an empty-string Feedback.
data = dict(
    CustomerID=list(range(1, 11)),
    OrderValue=[30.5, np.nan, 45.0, 20.0, 55.0, 60.0, -5, 70.0, 80.0, 90.0],
    Feedback=[
        'Good', 'Very Good', None, 'Average', 'Excellent',
        '', 'Poor', 'Good', 'Very Good', 'Excellent',
    ],
)


In [None]:
# Build the working DataFrame from the column-oriented dict defined above.
# (To work on real data instead, load it here with pd.read_csv(<path>).)
df = pd.DataFrame.from_dict(data)

# Initial DataFrame, before any cleaning
print(df)

   CustomerID  OrderValue   Feedback
0           1        30.5       Good
1           2         NaN  Very Good
2           3        45.0       None
3           4        20.0    Average
4           5        55.0  Excellent
5           6        60.0           
6           7        -5.0       Poor
7           8        70.0       Good
8           9        80.0  Very Good
9          10        90.0  Excellent


In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns of the raw data.
df.describe()

Unnamed: 0,CustomerID,OrderValue
count,10.0,9.0
mean,5.5,49.5
std,3.02765,30.317074
min,1.0,-5.0
25%,3.25,30.5
50%,5.5,55.0
75%,7.75,70.0
max,10.0,90.0


In [None]:
# Data Cleaning - Handling Missing Values
# Impute missing 'OrderValue' entries with the column mean, and turn blank
# 'Feedback' strings into NaN so they count as missing. No rows are dropped
# in this cell.
# NOTE(review): the mean is computed while the invalid negative order value is
# still present, so it pulls the imputed value down - confirm this is intended.
order_value_mean = df['OrderValue'].mean()
df.loc[df['OrderValue'].isna(), 'OrderValue'] = order_value_mean

# Empty Feedback strings become NaN
df['Feedback'] = df['Feedback'].replace(to_replace='', value=np.nan)

print(df)

   CustomerID  OrderValue   Feedback
0           1        30.5       Good
1           2        49.5  Very Good
2           3        45.0       None
3           4        20.0    Average
4           5        55.0  Excellent
5           6        60.0        NaN
6           7        -5.0       Poor
7           8        70.0       Good
8           9        80.0  Very Good
9          10        90.0  Excellent


In [None]:
# Drop rows where Feedback is missing (None/NaN).
# .copy() makes the result an independent frame, so assigning columns to it in
# later cells does not raise pandas' SettingWithCopyWarning.
df = df.dropna(subset=['Feedback']).copy()

In [None]:
# Blank Feedback strings were already converted to NaN, and the affected rows
# dropped, in the preceding cells. Repeating those steps here was a no-op, and
# re-assigning a column on the filtered frame can raise SettingWithCopyWarning.
# Verify the invariant instead of redoing the work, then show the result.
assert df['Feedback'].notna().all(), "Feedback still contains missing values"
assert not df['Feedback'].eq('').any(), "Feedback still contains empty strings"
print(df)

   CustomerID  OrderValue   Feedback
0           1        30.5       Good
1           2        49.5  Very Good
3           4        20.0    Average
4           5        55.0  Excellent
6           7        -5.0       Poor
7           8        70.0       Good
8           9        80.0  Very Good
9          10        90.0  Excellent


In [None]:
# Data Cleaning - Handling Negative Values
# An order cannot have a negative value, so every such entry is reset to 0;
# all other entries (including any NaN) are left untouched.
df['OrderValue'] = df['OrderValue'].mask(df['OrderValue'] < 0, 0)

print("\nDataFrame after Cleaning:", df, sep="\n")


DataFrame after Cleaning:
   CustomerID  OrderValue   Feedback
0           1        30.5       Good
1           2        49.5  Very Good
3           4        20.0    Average
4           5        55.0  Excellent
6           7         0.0       Poor
7           8        70.0       Good
8           9        80.0  Very Good
9          10        90.0  Excellent


In [None]:
# Data Transformation - Creating a New Column for Feedback Score
# Feedback labels listed from worst to best are paired with scores 1..5.
feedback_mapping = dict(
    zip(('Poor', 'Average', 'Good', 'Very Good', 'Excellent'), range(1, 6))
)
# Labels that are not in the mapping would come out as NaN.
df['FeedbackScore'] = df['Feedback'].map(feedback_mapping)
print(df)

   CustomerID  OrderValue   Feedback  FeedbackScore
0           1        30.5       Good              3
1           2        49.5  Very Good              4
3           4        20.0    Average              2
4           5        55.0  Excellent              5
6           7         0.0       Poor              1
7           8        70.0       Good              3
8           9        80.0  Very Good              4
9          10        90.0  Excellent              5


In [None]:
# Data Transformation - Normalizing 'OrderValue' to a 0-1 scale (min-max scaling)
order_min = df['OrderValue'].min()
order_range = df['OrderValue'].max() - order_min
# If every order value is identical the range is 0, and dividing by it would
# fill the column with NaN (0/0). Dividing by 1 instead maps such a constant
# column to 0.0 while leaving the normal case unchanged.
order_scale = order_range if order_range != 0 else 1.0
df['NormalizedOrderValue'] = (df['OrderValue'] - order_min) / order_scale
print(df)

   CustomerID  OrderValue   Feedback  FeedbackScore  NormalizedOrderValue
0           1        30.5       Good              3              0.338889
1           2        49.5  Very Good              4              0.550000
3           4        20.0    Average              2              0.222222
4           5        55.0  Excellent              5              0.611111
6           7         0.0       Poor              1              0.000000
7           8        70.0       Good              3              0.777778
8           9        80.0  Very Good              4              0.888889
9          10        90.0  Excellent              5              1.000000


In [None]:
# Show the frame once all transformation steps have been applied.
print("\nDataFrame after Transformation:", df, sep="\n")


DataFrame after Transformation:
   CustomerID  OrderValue   Feedback  FeedbackScore  NormalizedOrderValue
0           1        30.5       Good              3              0.338889
1           2        49.5  Very Good              4              0.550000
3           4        20.0    Average              2              0.222222
4           5        55.0  Excellent              5              0.611111
6           7         0.0       Poor              1              0.000000
7           8        70.0       Good              3              0.777778
8           9        80.0  Very Good              4              0.888889
9          10        90.0  Excellent              5              1.000000


In [None]:
# Data Validation - Validating Data Types
# Print the dtype of every column so unexpected types are easy to spot.
print("\nData Types:", df.dtypes, sep="\n")


Data Types:
CustomerID                int64
OrderValue              float64
Feedback                 object
FeedbackScore             int64
NormalizedOrderValue    float64
dtype: object


In [None]:
# Data validation - Checking for Duplicates
# Boolean mask that is True for each row repeating an earlier row in every column.
duplicates = df.duplicated()
print("\nDuplicate Rows:", df.loc[duplicates], sep="\n")


Duplicate Rows:
Empty DataFrame
Columns: [CustomerID, OrderValue, Feedback, FeedbackScore, NormalizedOrderValue]
Index: []


In [None]:
# Summary Statistics
# Descriptive statistics for the numeric columns of the prepared data.
summary_stats = df.describe()
print("\nSummary Statistics:", summary_stats, sep="\n")


Summary Statistics:
       CustomerID  OrderValue  FeedbackScore  NormalizedOrderValue
count    8.000000    8.000000       8.000000              8.000000
mean     5.750000   49.375000       3.375000              0.548611
std      3.284161   30.943208       1.407886              0.343813
min      1.000000    0.000000       1.000000              0.000000
25%      3.500000   27.875000       2.750000              0.309722
50%      6.000000   52.250000       3.500000              0.580556
75%      8.250000   72.500000       4.250000              0.805556
max     10.000000   90.000000       5.000000              1.000000


In [None]:
# Saving the prepared DataFrame to a CSV file
# The filename is defined once so the file written and the confirmation
# message can never disagree.
OUTPUT_CSV = 'prepared_restaurant_data.csv'
# index=False: the row index (which has gaps after the dropped rows) is not written.
df.to_csv(OUTPUT_CSV, index=False)
print(f"\nPrepared data saved to '{OUTPUT_CSV}'.")