In [63]:
import pandas as pd
import numpy as np
import sys
sys.path.append("../")

import src.support as sp

In [64]:
# Load the raw episode table; the first CSV column is used as the row index.
df = pd.read_csv("../data/the_office_series.csv", index_col = 0)

In [65]:
# Preview the first two rows to check that the file was read as expected.
df.head(2)

Unnamed: 0,Season,EpisodeTitle,About,Ratings,Votes,Viewership,Duration,Date,GuestStars,Director,Writers
0,1,Pilot,The premiere episode introduces the boss and s...,7.5,4936,11.2,23,24 March 2005,,Ken Kwapis,Ricky Gervais |Stephen Merchant and Greg Daniels
1,1,Diversity Day,Michael's off color remark puts a sensitivity ...,8.3,4801,6.0,23,29 March 2005,,Ken Kwapis,B. J. Novak


In [66]:
# Quick profile of the raw frame using the project helper. As the output below shows,
# it prints the shape, the column names, the missing-value count per column, the
# data types and the head of the dataframe (and, per the original note, its describe()).
sp.analyze_dataframe(df)

Shape: (188, 11)

Columns: Index(['Season', 'EpisodeTitle', 'About', 'Ratings', 'Votes', 'Viewership',
       'Duration', 'Date', 'GuestStars', 'Director', 'Writers'],
      dtype='object')

Missing values: Season            0
EpisodeTitle      0
About             0
Ratings           0
Votes             0
Viewership        0
Duration          0
Date              0
GuestStars      159
Director          0
Writers           0
dtype: int64

Data types: Season            int64
EpisodeTitle     object
About            object
Ratings         float64
Votes             int64
Viewership      float64
Duration          int64
Date             object
GuestStars       object
Director         object
Writers          object
dtype: object

Head:    Season   EpisodeTitle                                              About  \
0       1          Pilot  The premiere episode introduces the boss and s...   
1       1  Diversity Day  Michael's off color remark puts a sensitivity ...   
2       1    Health Care 

In [67]:
# Turn the textual air dates (e.g. "24 March 2005") into real datetime values,
# then show the first row to confirm the conversion.
df = df.assign(Date=pd.to_datetime(df["Date"]))
df.head(1)

Unnamed: 0,Season,EpisodeTitle,About,Ratings,Votes,Viewership,Duration,Date,GuestStars,Director,Writers
0,1,Pilot,The premiere episode introduces the boss and s...,7.5,4936,11.2,23,2005-03-24,,Ken Kwapis,Ricky Gervais |Stephen Merchant and Greg Daniels


In [68]:
# Lower-case every column label so the rest of the notebook uses one naming style.
df = df.rename(columns=str.lower)
df.head(1)

Unnamed: 0,season,episodetitle,about,ratings,votes,viewership,duration,date,gueststars,director,writers
0,1,Pilot,The premiere episode introduces the boss and s...,7.5,4936,11.2,23,2005-03-24,,Ken Kwapis,Ricky Gervais |Stephen Merchant and Greg Daniels


In [69]:
# Remove the rows labelled 95 and 108 ("Niagara Part 2" and "The Delivery Part 2").
# In the lines dataframe each of these two-parters is stored as a single episode,
# so the second halves are dropped here to keep both tables aligned.
df = df.drop(index=[95, 108])

In [70]:
# Project helper that takes a dataframe, resets the index, renames the index, resets the
# index again, and drops the index column. As the preview in the next cell shows, the
# result has an "episode_id" column numbered from 0 next to a fresh default index.
df = sp.reset_and_rename_index(df, "episode_id")

In [71]:
# Check the first rows after the index rebuild (episode_id should now be present).
df.head(3)

Unnamed: 0,episode_id,season,episodetitle,about,ratings,votes,viewership,duration,date,gueststars,director,writers
0,0,1,Pilot,The premiere episode introduces the boss and s...,7.5,4936,11.2,23,2005-03-24,,Ken Kwapis,Ricky Gervais |Stephen Merchant and Greg Daniels
1,1,1,Diversity Day,Michael's off color remark puts a sensitivity ...,8.3,4801,6.0,23,2005-03-29,,Ken Kwapis,B. J. Novak
2,2,1,Health Care,Michael leaves Dwight in charge of picking the...,7.8,4024,5.8,22,2005-04-05,,Ken Whittingham,Paul Lieberstein


In [72]:
# Give the two run-together column names readable snake_case spellings.
df = df.rename(
    {"episodetitle": "episode_title", "gueststars": "guest_stars"},
    axis="columns",
)
df.head(1)

Unnamed: 0,episode_id,season,episode_title,about,ratings,votes,viewership,duration,date,guest_stars,director,writers
0,0,1,Pilot,The premiere episode introduces the boss and s...,7.5,4936,11.2,23,2005-03-24,,Ken Kwapis,Ricky Gervais |Stephen Merchant and Greg Daniels


In [73]:
# Build df_writers with one row per (episode, writer) pair, so that the number of
# episodes each writer has written can be counted later.
# The raw field separates names with "|" and with the word "and"
# (e.g. "Ricky Gervais |Stephen Merchant and Greg Daniels"). Splitting on the bare
# substring "and" would also cut any name that merely contains those letters, so
# "and" only counts as a separator when it is surrounded by whitespace.
# A multi-character pattern is interpreted by str.split as a regular expression.
df_writers = df.assign(
    writers=df["writers"].str.split(r"\s*\|\s*|\s+and\s+")
).explode("writers")
# Trim any leftover padding so the same writer is always spelled identically.
df_writers["writers"] = df_writers["writers"].str.strip()
df_writers.head(3)

Unnamed: 0,episode_id,season,episode_title,about,ratings,votes,viewership,duration,date,guest_stars,director,writers
0,0,1,Pilot,The premiere episode introduces the boss and s...,7.5,4936,11.2,23,2005-03-24,,Ken Kwapis,Ricky Gervais
0,0,1,Pilot,The premiere episode introduces the boss and s...,7.5,4936,11.2,23,2005-03-24,,Ken Kwapis,Stephen Merchant
0,0,1,Pilot,The premiere episode introduces the boss and s...,7.5,4936,11.2,23,2005-03-24,,Ken Kwapis,Greg Daniels


In [74]:
# The per-writer detail now lives in df_writers, so the combined "writers" text
# column is no longer needed in the episode table.
df = df.drop(columns=["writers"])

In [75]:
# Keep only the link between an episode and each of its writers.
# The two wanted columns are selected by name instead of dropping the others by their
# position in df.columns: this does not depend on df's current column order and gives
# the same result if the cell is executed again.
df_writers = df_writers[["episode_id", "writers"]]
df_writers.head()

Unnamed: 0,episode_id,writers
0,0,Ricky Gervais
0,0,Stephen Merchant
0,0,Greg Daniels
1,1,B. J. Novak
2,2,Paul Lieberstein


In [76]:
# Swap every double quote in the "about" text for a single quote, intended to avoid
# errors when the data is inserted into SQL later on.
# NOTE(review): single quotes usually need escaping in SQL string literals as well;
# confirm that the insert code uses parameterized queries.
df = df.assign(about=df["about"].str.replace(r"[\"]", r"'", regex=True))

In [77]:
# Write the cleaned episode table to disk; index=False leaves the row index out of the file.
df.to_csv('../data/infoepisodes.csv', index=False)

In [78]:
# Write the episode/writer pairs to disk. The row index (repeated per episode after the
# explode, see the preview above) is left out; episode_id carries the episode key.
df_writers.to_csv('../data/writers.csv', index=False)