# Data Prep Assignments

In [1]:
import numpy as np
import pandas as pd

## Assignment 1: Set the Correct Row Granularity

In [2]:
# 1. Load the 'entertainment' worksheet of the Excel workbook into a pandas DataFrame
df_raw = pd.read_excel(
    io='../Data/entertainment.xlsx',
    sheet_name='entertainment',
)
# Preview the first five rows to see the raw layout
df_raw.head(n=5)

Unnamed: 0,name,entertainment,hours_per_week
0,Emily,video_games,5.1
1,Liam,video_games,4.9
2,Olivia,video_games,4.7
3,Noah,video_games,4.6
4,Ava,video_games,5.0


In [3]:
# 2. Check the number of rows and columns
# (.shape is a (rows, columns) tuple; the row index is not counted as a column)
df_raw.shape

(600, 3)

In [4]:
# 3. Determine the row granularity needed:
# count how many rows each entertainment type contributes to the raw data
df_raw['entertainment'].value_counts()

entertainment
video_games    150
tv_shows       150
movies         150
books          150
Name: count, dtype: int64

In [5]:
# Inspect a single student: the same name appears once per entertainment type,
# so the raw data is one row per (student, entertainment type)
df_raw.loc[df_raw['name'] == 'Emily']

Unnamed: 0,name,entertainment,hours_per_week
0,Emily,video_games,5.1
150,Emily,tv_shows,5.2
300,Emily,movies,1.4
450,Emily,books,0.5


In [6]:
# 4. Apply the correct DataFrame transformation:
# reshape from one row per (student, entertainment type) to one row per student,
# with one hours_per_week column per entertainment type.
# NOTE: fillna(0) here already replaces every missing (student, type) combination
# with 0, which is why the later missing-value checks report no nulls.
df = (df_raw.pivot(index='name',
                   columns='entertainment',
                   values='hours_per_week')
            .fillna(0)
            .rename_axis(columns=None)  # drop the leftover 'entertainment' label on the columns axis
            .reset_index())
df

Unnamed: 0,name,books,movies,tv_shows,video_games
0,Aaliyah,0.5,1.5,4.6,4.9
1,Abigail,0.0,1.4,4.5,4.8
2,Addison,0.5,1.6,4.5,5.0
3,Adeline,3.5,4.4,4.5,6.6
4,Alana,2.8,3.9,3.8,5.6
...,...,...,...,...,...
145,Winifred,5.2,5.4,4.6,6.9
146,Xanthe,6.0,5.6,4.6,6.7
147,Zara,5.5,6.7,5.7,7.7
148,Zoe,0.0,1.5,6.1,5.2


In [7]:
# Check for NaN values: count the missing entries in each column
df.isna().sum()


name           0
books          0
movies         0
tv_shows       0
video_games    0
dtype: int64

In [8]:
# 5. Save the transformation as a new DataFrame
# done in step #4

In [9]:
# 6. Check the number of rows and columns
# (after the pivot there should be one row per student instead of one per student/type pair)
df.shape

(150, 5)

In [10]:
# Preview the first five rows of the reshaped, one-row-per-student DataFrame
df.head()

Unnamed: 0,name,books,movies,tv_shows,video_games
0,Aaliyah,0.5,1.5,4.6,4.9
1,Abigail,0.0,1.4,4.5,4.8
2,Addison,0.5,1.6,4.5,5.0
3,Adeline,3.5,4.4,4.5,6.6
4,Alana,2.8,3.9,3.8,5.6


## Assignment 2: Prepare Columns for Modeling

In [11]:
# 1. Find the missing values: count NaN entries per column
df.isna().sum()


name           0
books          0
movies         0
tv_shows       0
video_games    0
dtype: int64

In [13]:
# Show every row that has at least one missing value in any column
df.loc[df.isna().any(axis='columns')]

Unnamed: 0,name,books,movies,tv_shows,video_games


In [12]:
# 2. Fill in the missing values with zeros
# (a missing value means the student reported no hours for that entertainment type;
# the null counts above are already zero, so this leaves the data unchanged)
df = df.fillna(value=0)
df

Unnamed: 0,name,books,movies,tv_shows,video_games
0,Aaliyah,0.5,1.5,4.6,4.9
1,Abigail,0.0,1.4,4.5,4.8
2,Addison,0.5,1.6,4.5,5.0
3,Adeline,3.5,4.4,4.5,6.6
4,Alana,2.8,3.9,3.8,5.6
...,...,...,...,...,...
145,Winifred,5.2,5.4,4.6,6.9
146,Xanthe,6.0,5.6,4.6,6.7
147,Zara,5.5,6.7,5.7,7.7
148,Zoe,0.0,1.5,6.1,5.2


In [14]:
# 3. Create a new column called video_game_lover for people who played more than 7 hours of video games
VIDEO_GAME_LOVER_MIN_HOURS = 7  # hours per week; the comparison below is strictly greater-than

# Integer flag: 1 for a video game lover, 0 for everyone else
df['video_game_lover'] = np.where(df['video_games'] > VIDEO_GAME_LOVER_MIN_HOURS, 1, 0)
df

Unnamed: 0,name,books,movies,tv_shows,video_games,video_game_lover
0,Aaliyah,0.5,1.5,4.6,4.9,0
1,Abigail,0.0,1.4,4.5,4.8,0
2,Addison,0.5,1.6,4.5,5.0,0
3,Adeline,3.5,4.4,4.5,6.6,0
4,Alana,2.8,3.9,3.8,5.6,0
...,...,...,...,...,...,...
145,Winifred,5.2,5.4,4.6,6.9,0
146,Xanthe,6.0,5.6,4.6,6.7,0
147,Zara,5.5,6.7,5.7,7.7,1
148,Zoe,0.0,1.5,6.1,5.2,0


## Assignment 3: Feature Engineering

In [16]:
# Preview the DataFrame before engineering new features
df.head()

Unnamed: 0,name,books,movies,tv_shows,video_games,video_game_lover
0,Aaliyah,0.5,1.5,4.6,4.9,0
1,Abigail,0.0,1.4,4.5,4.8,0
2,Addison,0.5,1.6,4.5,5.0,0
3,Adeline,3.5,4.4,4.5,6.6,0
4,Alana,2.8,3.9,3.8,5.6,0


In [17]:
# 1. Create a column called total_entertainment that sums up all the types of entertainment for each student
# (books + movies + tv_shows + video_games, added in that order)
df['total_entertainment'] = (
    df['books']
    .add(df['movies'])
    .add(df['tv_shows'])
    .add(df['video_games'])
)
df.head()

Unnamed: 0,name,books,movies,tv_shows,video_games,video_game_lover,total_entertainment
0,Aaliyah,0.5,1.5,4.6,4.9,0,11.5
1,Abigail,0.0,1.4,4.5,4.8,0,10.7
2,Addison,0.5,1.6,4.5,5.0,0,11.6
3,Adeline,3.5,4.4,4.5,6.6,0,19.0
4,Alana,2.8,3.9,3.8,5.6,0,16.1


In [19]:
# 2. Create a column called pct_screen that calculates the percent of entertainment
# that's on screens (everything except for books) for each student.
# The value is stored as a fraction between 0 and 1, not multiplied by 100.
df['pct_screen'] = (
    df['movies']
    .add(df['tv_shows'])
    .add(df['video_games'])
    .div(df['total_entertainment'])
)
df.head()

Unnamed: 0,name,books,movies,tv_shows,video_games,video_game_lover,total_entertainment,pct_screen
0,Aaliyah,0.5,1.5,4.6,4.9,0,11.5,0.956522
1,Abigail,0.0,1.4,4.5,4.8,0,10.7,1.0
2,Addison,0.5,1.6,4.5,5.0,0,11.6,0.956897
3,Adeline,3.5,4.4,4.5,6.6,0,19.0,0.815789
4,Alana,2.8,3.9,3.8,5.6,0,16.1,0.826087


## Assignment 4: Feature Selection

In [22]:
# 1. Save the student name column of the DataFrame as its own Series for reference
# (names are identifiers, not features, so they are kept aside from the modeling data)
names = df.loc[:, 'name']
names

0       Aaliyah
1       Abigail
2       Addison
3       Adeline
4         Alana
         ...   
145    Winifred
146      Xanthe
147        Zara
148         Zoe
149        Zoey
Name: name, Length: 150, dtype: object

In [24]:
# 2. Save the three new columns of the DataFrame as its own DataFrame for modeling:
# video_game_lover, total_entertainment and pct_screen
df_model = df.loc[:, ['video_game_lover', 'total_entertainment', 'pct_screen']]
df_model.head()

Unnamed: 0,video_game_lover,total_entertainment,pct_screen
0,0,11.5,0.956522
1,0,10.7,1.0
2,0,11.6,0.956897
3,0,19.0,0.815789
4,0,16.1,0.826087


## Assignment 5: Feature Scaling

In [25]:
# 1. Scale the features in the modeling DataFrame so they all have a mean of 0 and a standard deviation of 1
from sklearn.preprocessing import StandardScaler

# Learn each column's mean and standard deviation, then apply the scaling
std_scaler = StandardScaler().fit(df_model)
standardized = std_scaler.transform(df_model)

# The scaler returns a plain array, so wrap it with the column labels for display
pd.DataFrame(data=standardized, columns=df_model.columns)

Unnamed: 0,video_game_lover,total_entertainment,pct_screen
0,-0.294884,-1.360056,1.384262
1,-0.294884,-1.551591,1.918235
2,-0.294884,-1.336114,1.388865
3,-0.294884,0.435582,-0.344126
4,-0.294884,-0.258731,-0.217658
...,...,...,...
145,-0.294884,1.177779,-0.971503
146,-0.294884,1.369314,-1.299596
147,3.391165,2.015743,-0.720344
148,-0.294884,-1.048812,1.918235


In [26]:
# 2. Save the output as a final DataFrame that's ready for modeling
# Carry over df_model's index (not just its columns) so every scaled row stays
# aligned with `names` even if df_model is ever filtered or re-indexed upstream.
df_std = pd.DataFrame(standardized, columns=df_model.columns, index=df_model.index)

# Sanity check: the means should be ~0. describe() reports the sample standard
# deviation (ddof=1) while StandardScaler divides by the population standard
# deviation (ddof=0), so std shows as sqrt(n / (n - 1)) rather than exactly 1.
df_std.describe()

Unnamed: 0,video_game_lover,total_entertainment,pct_screen
count,150.0,150.0,150.0
mean,-5.77316e-17,-1.000681e-15,-7.460699e-16
std,1.00335,1.00335,1.00335
min,-0.2948839,-1.743126,-1.574103
25%,-0.2948839,-1.018885,-0.7387959
50%,-0.2948839,0.0884252,-0.3422683
75%,-0.2948839,0.7109131,1.112733
max,3.391165,2.015743,1.918235


In [27]:
# 3. Optional: pickle the dataframe for modeling (save to a file)
# The file is written relative to the notebook's working directory.
df_std.to_pickle(path='ent_data_for_modeling.pkl')