# Cleaning Data
If you want to type along with me, use [this notebook](https://humboldt.cloudbank.2i2c.cloud/hub/user-redirect/git-pull?repo=https%3A%2F%2Fgithub.com%2Fbethanyj0%2Fdata271_sp25&branch=main&urlpath=tree%2Fdata271_sp25%2Flectures%2Fdata271_lec27_live.ipynb) instead. 
If you don't want to type and want to follow along just by executing the cells, stay in this notebook. 

In [None]:
import numpy as np
import pandas as pd

In [None]:
# import the youtube data from last time
# dtype forces viewCount and likeCount to be read as strings (dtype 'object')
# instead of letting pandas infer numbers; the "Convert datatypes" section
# below turns them into numeric columns.
df = pd.read_csv('humboldt_youtube_2025.csv', dtype={'viewCount': 'object', 'likeCount':'object'})
df.head()

## Cleaning the data 

In [None]:
# Check the column names


In [None]:
# Inspect some questions


In [None]:
# the two columns above are identical


In [None]:
# Check for any columns with just one value


In [None]:
# Save some relevant data, then remove redundant columns
channel_id = 
chanel_title = 

In [None]:
df.head()

### Reformat column names
Our column names are in different formats. Let's fix that.

In [None]:
# OPTION 1 .rename method


In [None]:
# .rename returns a new DataFrame and does not update the original (pass inplace=True to modify df itself)
df.head()

In [None]:
# OPTION 2: more general approach that would work for many columns
# Split column names by words
import re
# The zero-width lookahead (?=[A-Z]) splits *before* every capital letter
# without consuming it, so 'viewCount' -> ['view', 'Count'].
# A name that starts with a capital (e.g. 'Title') produces a leading empty
# string; dropping empty pieces keeps it from becoming '_title' later on.
split_by_words = [
    [word for word in re.split(r'(?=[A-Z])', name) if word]
    for name in df.columns
]
split_by_words

In [None]:
# Glue each column's words together with underscores, then lowercase the result
columns_with_underscores = list(
    map(lambda words: '_'.join(words).lower(), split_by_words)
)
columns_with_underscores

In [None]:
# Reassign column names
df.columns = columns_with_underscores

In [None]:
df.head()

### Convert datatypes

In [None]:
# Suppose we wanted to look at the relationship between view counts and like counts
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_style('darkgrid')
sns.scatterplot(data = df, x='view_count',y='like_count')
plt.show()

In [None]:
# Something is funky. Check the data types


In [None]:
# convert the data types
# view_count and like_count were read in as strings; pd.to_numeric parses them
# into numbers (integers, or floats if any value is missing).
df = df.assign(
    view_count=pd.to_numeric(df['view_count']),
    like_count=pd.to_numeric(df['like_count']),
)

In [None]:
# check data types again
df.dtypes

In [None]:
# plot again
sns.scatterplot(data = df, x='view_count',y='like_count')
plt.show()

In [None]:
# What if we wanted to plot trend in view counts?
sns.lineplot(data = df, x='published_at',y='view_count')
plt.show()

In [None]:
# change dates to datetime data
# pd.to_datetime parses the date strings so the column can be used as a time axis
df['published_at'] = pd.to_datetime(df['published_at'])
df.head()

In [None]:
# Check dtypes again
df.dtypes

In [None]:
# plot again
plt.figure(figsize = (10,4))
sns.lineplot(data = df, x='published_at',y='view_count')
plt.show()

#### More on working with dates
Pandas can parse dates written in many different string formats.

In [None]:
date1 = 'April 5, 2025'
# %B = full month name, %d = day of the month, %Y = four-digit year
pd.to_datetime(date1, format='%B %d, %Y')

In [1]:
date2 = 'Apr 5, 2025'
# %b = abbreviated month name, %d = day of the month, %Y = four-digit year
pd.to_datetime(date2, format='%b %d, %Y')

NameError: name 'pd' is not defined

In [None]:
date3 = '4/5/25'
# %m = month number, %d = day of the month, %y = two-digit year
pd.to_datetime(date3, format='%m/%d/%y')

In [None]:
date4 = '5-4-25'
# Read as day-month-year so this is the same date as the other examples
# (assumes the string is day-first -- confirm; '%m-%d-%y' would give May 4 instead)
pd.to_datetime(date4, format='%d-%m-%y')

In [None]:
date5 = '4-5-2025'
# %m = month number, %d = day of the month, %Y = four-digit year
pd.to_datetime(date5, format='%m-%d-%Y')

In [None]:
date6 = '4-05-2025'
# Same format as the previous example: %d accepts both '5' and the zero-padded '05'
pd.to_datetime(date6, format='%m-%d-%Y')

Use [this reference](https://docs.python.org/3/library/datetime.html#strftime-and-strptime-behavior) for more info about datetime formats. 

### Reset the index

In [None]:
# Set video_id as the index for easy access by video id
# (inplace=True modifies df directly; re-running this cell raises a KeyError
# because video_id is no longer a regular column)
df.set_index('video_id',inplace=True)
df.head()

In [None]:
# index by video id
video_id = "PoGnLtH1UHY"
df.loc[video_id]

In [None]:
df.loc[df.view_count.idxmax()]

### Reorder the data

In [None]:
# reorder rows
df.sort_index(axis=0).head()

In [None]:
# doesn't update original
df.head()

In [None]:
# reorder columns
df.sort_index(axis=1).head()

In [None]:
# Place columns in your chosen order
df[['view_count','like_count','published_at','title']].head()

In [None]:
# sort the data 
df.sort_values(by='view_count').head()

In [None]:
# doesn't update original
df.head()

In [None]:
# do it inplace
df.sort_values(by = 'view_count',ascending=False, inplace=True)

In [None]:
df.head()

In [None]:
# sort by multiple values at once
df.sort_values(by = ['view_count','like_count'],ascending=[False,True], inplace=True)
df.head()

### Reshaping data

In [None]:
df_weather_wide = pd.read_csv('sample_weather.csv')
df_weather_wide = df_weather_wide.iloc[:,1:]
df_weather_wide

In [None]:
# transpose with more informative columns
date_index = 
date_index

In [None]:
# Make a long series
long_df = 
long_df

In [None]:
long_df = 
long_df

In [None]:
long_df.rename(columns = {'level_1':'variable'},inplace=True)
long_df

In [None]:
# If our columns had a name


In [None]:
date_index

In [None]:
# Now renaming isn't necessary


In [None]:
# another way
df_weather_wide

In [None]:
# another way
long_df = 
long_df

In [None]:
# change long format back into wide format
long_df.pivot(index = 'date',columns = 'variable',values='value')

### What to do when there are multiple values in categories

In [None]:
long_df = pd.read_csv('long_data.csv')
long_df = long_df.iloc[:,1:]
long_df.head()

In [None]:
# Pivot the data to get average sales by date and category


In [None]:
# Pivot the data to get TOTAL sales by date and category
# The aggregation is passed by name ('sum'): recent pandas versions emit a
# FutureWarning when given the Python builtin `sum` instead.
wide_df = long_df.pivot_table(index=['date'], columns='category', values=['sales'], aggfunc='sum')
wide_df

In [None]:
# Pivot the data to get TOTAL sales by date, product, and category
# The aggregation is passed by name ('sum'): recent pandas versions emit a
# FutureWarning when given the Python builtin `sum` instead.
long_df.pivot_table(index='date', columns=['category','product'], values=['sales'], aggfunc='sum')

In [None]:
# Go from wide to long
wide_df.reset_index().melt(id_vars='date', var_name=['type','category'])

## Activity

In [None]:

# Create a DataFrame with data cleaning and reshaping opportunities
# Note: every value below is a string, including the numeric 'Pet Age' values
# and the 'Date Adopted' values (written month-day-year), and the column
# names contain spaces and capital letters.
data = {
    'Pet Name': ['Fluffy', 'Whiskers', 'Bubbles', 'Spike', 'Coco', 'Maybelle', 'Snowball'],
    'Date Adopted': ['10-01-2023','03-04-2024','01-10-2024','02-14-2024','11-22-2023','01-04-2024','12-25-2025'],
    'Animal Type': ['Cat', 'Cat', 'Fish', 'Dog', 'Fish', 'Dog', 'Cat'],
    'Pet Age': ['3', '2', '13', '5', '4', '3', '2'],
    'Color': ['White', 'Gray', 'Orange', 'White', 'White', 'Black', 'Black'],
    'Happiness Level': ['High', 'Medium', 'High', 'Low', 'High', 'High', 'Medium']
}
df_pets = pd.DataFrame(data)
df_pets

**Activity 1:** Rename the columns of the pets dataframe to be in a better format.

**Activity 2:** Change any datatypes that should be adjusted.  

**Activity 3:** Practice pivoting the dataframe.