# Load libraries

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import regex as re
# Display up to 100 rows of data.
# `display.min_rows` only controls how many rows a *truncated* repr shows and is
# capped by `display.max_rows` (default 60), so both options have to be raised
# for 100 rows to actually appear.
pd.options.display.max_rows = 100
pd.options.display.min_rows = 100

# Load data

In [2]:
# Input CSV locations (relative paths, resolved against the working directory)
file_path = "episode_level_measures_parts_with_viewership_new_measures.csv"
imdb_file_path = "show_info_pivoted_with_imdb (1).csv"
file_path3 = "Air_Date1.csv"
cancelled_shows_filepath = 'Cancelled coding - NLP Survival Analysis.csv'

# Episode-level NLP measures: every column is read as text and malformed
# lines are skipped; numeric conversion happens in a later cell.
nlpdata = pd.read_csv(
    file_path,
    sep=',',
    index_col=False,
    dtype='unicode',
    on_bad_lines='skip',
)

# Show-level metadata, read the same way (all text, malformed lines skipped).
imdb = pd.read_csv(
    imdb_file_path,
    sep=',',
    index_col=False,
    dtype='unicode',
    on_bad_lines='skip',
)

# Air dates per episode: all text, but malformed lines are NOT skipped here.
airdate_data = pd.read_csv(
    file_path3,
    sep=',',
    index_col=False,
    dtype='unicode',
)

# Cancellation coding per show, read with pandas' default type inference.
cancelled = pd.read_csv(cancelled_shows_filepath)

## NLP Data

In [3]:
# Preview the first five episode rows of the raw NLP measures
nlpdata.head(n=5)

Unnamed: 0,Show,Season,Episode,File Name,sd_div_mean_1,sd_div_mean_2,sd_div_mean_3,sd_sum_1,sd_sum_2,sd_sum_3,...,motion_1,motion_2,motion_3,space_1,space_2,space_3,time_1,time_2,time_3,Viewership (millions)
0,2 Broke Girls,1,1,2 Broke Girls - S01E01 - 2.Broke.Girls.Season....,5658.359420163232,9159.676336,4626.990287281845,18.66514228373773,20.5456727014644,21.69718205504922,...,1.7342857142857144,2.4214285714285717,2.0725,5.192857142857143,7.394285714285714,5.41375,4.322857142857143,3.005714285714286,2.5725,19.37
1,2 Broke Girls,1,2,2 Broke Girls - S01E02 - 2 Broke Girls S01E02 ...,6280.502095337137,3451.804293115786,7127.226693789984,19.733889186635707,20.345822809580056,17.44018977836537,...,1.4857142857142858,2.987142857142857,2.2185714285714284,6.852857142857142,7.992857142857143,6.631428571428572,4.492857142857143,4.335714285714285,3.932857142857143,11.75
2,2 Broke Girls,1,3,2 Broke Girls - S01E03 - 2 Broke Girls S01E03 ...,3931.6040952853023,3977.461758366915,5506.511934027525,21.55941993523026,20.759948099987184,19.837830499174945,...,1.741428571428571,2.361428571428572,5.745,6.055714285714286,6.632857142857143,4.38625,4.8100000000000005,2.487142857142857,3.27375,11.42
3,2 Broke Girls,1,4,2 Broke Girls - S01E04 - 2 Broke Girls S01E04 ...,5899.921656,7097.733100232231,3386.8579172793297,21.26736255478216,20.24850059,19.43430838831536,...,2.347142857142857,3.0485714285714285,2.847142857142857,7.688571428571429,6.484285714285714,6.192857142857142,4.957142857142857,3.7471428571428578,4.097142857142857,10.71
4,2 Broke Girls,1,6,2 Broke Girls - S01E06 - 2 Broke Girls S01E06 ...,9570.497331194108,5883.807212973031,6103.310464,20.229745482690152,21.176402608924068,21.713179209464133,...,1.4257142857142855,1.9,2.015,6.475714285714285,4.26,5.53875,4.502857142857143,4.93,3.62375,11.19


In [4]:
# Column types of the first ten columns (everything was loaded as text)
nlpdata.dtypes.iloc[:10]

Show             object
Season           object
Episode          object
File Name        object
sd_div_mean_1    object
sd_div_mean_2    object
sd_div_mean_3    object
sd_sum_1         object
sd_sum_2         object
sd_sum_3         object
dtype: object

In [5]:
# Number of distinct TV shows present in the NLP data
len(nlpdata['Show'].unique())

235

In [6]:
# (rows, columns) of the label-based slice from 'sd_div_mean_1' through 'time_3'
nlpdata.loc[:, 'sd_div_mean_1':'time_3'].shape

(26088, 129)

For each episode of each show, we have sentiment analysis metrics plus the viewership (in millions) of the debut release.
Each measure appears 3 times, once for each act within an episode. The 129 measure columns from the sentiment analysis output, divided by 3 acts, correspond to **43 measures per act**.

***NLP Dictionary***:
- Sd_div_mean: Aims to assess how quickly a consumer becomes immersed in a story—both mentally and emotionally—that is, how quickly they become absorbed in the narrative. By dividing the standard deviation of overall emotion scores by the mean, this measurement captures the shifts, variability, and intensity of emotions throughout the experience.
- Sd_sum: Sum of the standard deviations, i.e. the amount of emotion variance for each act.
- Sd_scaled: Standard deviation scaled across all emotions for ease of comparison.
- Anger
- Surprise
- Disgust
- Sadness
- Neutral
- Fear
- Joy
- Positive
- Negative
- Engaged: High psychological involvement or emotional investment, including greater use of personal pronouns, more emotional words, and greater cognitive processing contained in the act.
- Not Engaged: Low psychological involvement or emotional investment, including lesser use of personal pronouns, less emotional words, and less cognitive processing contained in the act.
- WC: Total word count contained in the act.
- Analytic: Analytical, formal, or logical discussion contained in the act.
- Clout: Social status, confidence, or leadership discussion contained in the act.
- Authenticity: Honest, non-filtered, non-regulated discussion contained in the act.
- Tone: The higher the tone, the more positive the tone in the act (below 50 is considered negative).
- WPS: Average words per sentence.
- Six letter: Percentage of words longer than six letters.
- Dic: Percentage of words that were captured as dictionary words.
- Cogprocess: An aggregate measurement that looks at the amount of words that reflect active information processing and mental activities, including causation contained in the act.
    - Insight: Considers realizations.
    - Cause: Examines causation between two elements.
    - Discrep: Considers what should, could, or would have happened, but never did, (exploring counterfactuals).
    - Tentative: Looks at whether something could or could not happen (e.g. maybe, perhaps).
    - Certain: Looks at absolute language (e.g. always, never).
    - Differ (TODO: confirm this stands for "differentiate"): Considers differentiation between two elements, using words such as hasn’t, but, else.
- Perceptual: An aggregate measurement of terms that describe perception, such as look, heard, and feeling.
    - See: Amount of text around viewing or seeing.
    - Hear: Amount of text around hearing or listening.
    - Feel: Looks at references to touch or feeling.
- Drives: An aggregate measurement that looks at different motivations.
    - Affiliation: Looks at relations and affiliations such as ally, friend, or being social.
    - Achieve: Considers the ability to win, earn success, and be better.
    - Power: Examines power dynamics and structures including superiority and bullying.
    - Reward: Examines the types of rewards that are discussed including receiving something, prizes, and benefits.
    - Risk: Examines the different types of dangers and doubts.
- Relativity: An aggregate measure that extends to spatial relationships, such as area, bend, and exit.
    - Motion: Examines the ability to move, including arrive, car, and go.
    - Space: Examines directions in space, including down, and in.
    - Time: Examines time durations, including end, until and season.


In [7]:
# Convert Season column to integer and Show column to string
nlpdata['Season'] = nlpdata['Season'].astype(int)
nlpdata['Show'] = nlpdata['Show'].astype(str)

# Drop unnecessary columns first, so the free-text 'File Name' column is not
# pointlessly coerced to numeric below. errors='ignore' keeps the cell
# re-runnable once the column is already gone.
nlpdata = nlpdata.drop(columns=['File Name'], errors='ignore')

# Convert every remaining column except the identifiers to numeric;
# values that cannot be parsed become NaN.
columns_to_convert = nlpdata.columns.difference(['Show', 'Season', 'Episode'])
nlpdata[columns_to_convert] = nlpdata[columns_to_convert].apply(pd.to_numeric, errors='coerce')

# Clean 'Episode' column
nlpdata['Episode'] = (
    nlpdata['Episode']
    .astype(str)  # Allows re-running the cell after 'Episode' is already int
    .str.strip()  # Remove leading/trailing whitespace
    .str.split('-')  # Take the first part before '-'
    .str[0]
    .str.extract(r'(\d+)', expand=False)  # Keep only the first run of digits
    .astype(int)  # Raises if an episode value contains no digits at all
)

nlpdata.head()

Unnamed: 0,Show,Season,Episode,sd_div_mean_1,sd_div_mean_2,sd_div_mean_3,sd_sum_1,sd_sum_2,sd_sum_3,sd_scaled_1,...,motion_1,motion_2,motion_3,space_1,space_2,space_3,time_1,time_2,time_3,Viewership (millions)
0,2 Broke Girls,1,1,5658.35942,9159.676336,4626.990287,18.665142,20.545673,21.697182,598.633647,...,1.734286,2.421429,2.0725,5.192857,7.394286,5.41375,4.322857,3.005714,2.5725,19.37
1,2 Broke Girls,1,2,6280.502095,3451.804293,7127.226694,19.733889,20.345823,17.44019,629.842904,...,1.485714,2.987143,2.218571,6.852857,7.992857,6.631429,4.492857,4.335714,3.932857,11.75
2,2 Broke Girls,1,3,3931.604095,3977.461758,5506.511934,21.55942,20.759948,19.83783,694.243568,...,1.741429,2.361429,5.745,6.055714,6.632857,4.38625,4.81,2.487143,3.27375,11.42
3,2 Broke Girls,1,4,5899.921656,7097.7331,3386.857917,21.267363,20.248501,19.434308,684.38765,...,2.347143,3.048571,2.847143,7.688571,6.484286,6.192857,4.957143,3.747143,4.097143,10.71
4,2 Broke Girls,1,6,9570.497331,5883.807213,6103.310464,20.229745,21.176403,21.713179,648.841555,...,1.425714,1.9,2.015,6.475714,4.26,5.53875,4.502857,4.93,3.62375,11.19


In [8]:
# Count missing values per column and show the five columns with the most
(
    nlpdata
    .isnull()
    .sum()
    .sort_values(ascending=False)
    .iloc[:5]
)

Viewership (millions)    227
Authentic_1                1
percept_2                  1
percept_1                  1
differ_2                   1
dtype: int64

In [9]:
# Keep only episodes that have a viewership figure
nlpdata = nlpdata.dropna(subset=['Viewership (millions)'])

In [10]:
# Shorten the viewership column name to 'vw'
nlpdata = nlpdata.rename({'Viewership (millions)': 'vw'}, axis='columns')

## IMDB Data

In [11]:
# Preview the first five rows of the show-level metadata
imdb.head(n=5)

Unnamed: 0,Show,Executive producers,Genre,Network,No.of episodes,No.of seasons,Starring,Year
0,2 Broke Girls,Michael Patrick King | Whitney Cummings | Mich...,Sitcom,CBS,138,6,Kat Dennings | Beth Behrs | Garrett Morris | J...,2011
1,24,Joel Surnow | Robert Cochran | Brian Grazer | ...,Serial drama | Crime thriller | Espionage | Ac...,Fox,192 + 24: Redemption + 12 (24: Live Another Day),9,Kiefer Sutherland | (and | others | ),2001
2,30 Rock,Lorne Michaels | Tina Fey | Marci Klein | Davi...,Sitcom | Satire | Farce,NBC,139,7,Tina Fey | Tracy Morgan | Jane Krakowski | Jac...,2006
3,3rd Rock from the Sun,John Lithgow | Bonnie Turner | Terry Turner | ...,Sitcom | Science fiction,NBC,139,6,John Lithgow | Kristen Johnston | French Stewa...,1996
4,9/1/2001,Ryan Murphy | Brad Falchuk | Tim Minear | Juan...,Procedural drama,Fox | ABC,96,6,Angela Bassett | Peter Krause | Oliver Stark |...,2018


In [12]:
# Print (rows, columns) of the show metadata, then display each column's dtype
print(imdb.shape)
imdb.dtypes

(304, 8)


Show                   object
Executive producers    object
Genre                  object
Network                object
No.of episodes         object
No.of seasons          object
Starring               object
Year                   object
dtype: object

In [13]:
# Select shows whose 'No.of episodes' value is not made up purely of digits
invalid_episodes = imdb.loc[
    imdb['No.of episodes'].map(lambda value: not str(value).isdigit())
]
invalid_episodes[['Show', 'No.of episodes']]

Unnamed: 0,Show,No.of episodes
1,24,192 + 24: Redemption + 12 (24: Live Another Day)
35,Brooklyn Bridge,35 (includes 2 two-part episodes)
43,Caroline in the City,97 (1 unaired)
45,Cheers,275 (including three double-length episodes an...
60,Cursed,17 (2 unaired)
72,Diagnosis: Murder,178 + 5 TV movies + Pilot
92,Family Ties,176 + one film
95,Fired Up,28 (5 unaired)
111,Grand,26 (1 unaired)
119,Hearts Afire,54 (1 unaired)


In [14]:
# Cast the 'Year' column from text to integer
imdb = imdb.astype({'Year': int})

# Define the function to truncate after the first number
def truncate_after_first_number(text):
    """Return the first run of digits found in `text`, as a string.

    `text` is converted with `str()` first, so non-string values are accepted.
    When the text contains no digit at all, the string "0" is returned.
    """
    # Skip any leading non-digits, then capture the first digit run
    leading_number = re.match(r'\D*(\d+)', str(text))
    if leading_number is None:
        return "0"
    return leading_number.group(1)

# Step 1: reduce 'No.of seasons' to its leading number and store it as integer
seasons_as_text = imdb['No.of seasons'].map(truncate_after_first_number)
imdb['No.of seasons'] = seasons_as_text.astype(int)

# Step 2: same treatment for 'No.of episodes' (e.g. '97 (1 unaired)' -> 97)
episodes_as_text = imdb['No.of episodes'].map(truncate_after_first_number)
imdb['No.of episodes'] = episodes_as_text.astype(int)

imdb.head()

Unnamed: 0,Show,Executive producers,Genre,Network,No.of episodes,No.of seasons,Starring,Year
0,2 Broke Girls,Michael Patrick King | Whitney Cummings | Mich...,Sitcom,CBS,138,6,Kat Dennings | Beth Behrs | Garrett Morris | J...,2011
1,24,Joel Surnow | Robert Cochran | Brian Grazer | ...,Serial drama | Crime thriller | Espionage | Ac...,Fox,192,9,Kiefer Sutherland | (and | others | ),2001
2,30 Rock,Lorne Michaels | Tina Fey | Marci Klein | Davi...,Sitcom | Satire | Farce,NBC,139,7,Tina Fey | Tracy Morgan | Jane Krakowski | Jac...,2006
3,3rd Rock from the Sun,John Lithgow | Bonnie Turner | Terry Turner | ...,Sitcom | Science fiction,NBC,139,6,John Lithgow | Kristen Johnston | French Stewa...,1996
4,9/1/2001,Ryan Murphy | Brad Falchuk | Tim Minear | Juan...,Procedural drama,Fox | ABC,96,6,Angela Bassett | Peter Krause | Oliver Stark |...,2018


In [15]:
# Re-run the digit-only check on 'No.of episodes'; an empty result means no
# non-numeric entries remain
invalid_episodes = imdb.loc[
    imdb['No.of episodes'].map(lambda value: not str(value).isdigit())
]
invalid_episodes[['Show', 'No.of episodes']]

Unnamed: 0,Show,No.of episodes


In [16]:
# Remove the people-related columns (silently skipped if already removed)
imdb = imdb.drop(columns=["Executive producers", "Starring"], errors='ignore')

# Keep only the first genre listed before '|', without surrounding whitespace
imdb['Genre'] = imdb['Genre'].str.split('|', n=1).str[0].str.strip()
# Keep only the first network listed before '|', without surrounding whitespace
imdb['Network'] = imdb['Network'].str.split('|', n=1).str[0].str.strip()

In [17]:
# Preview the cleaned show metadata
imdb.head(n=5)

Unnamed: 0,Show,Genre,Network,No.of episodes,No.of seasons,Year
0,2 Broke Girls,Sitcom,CBS,138,6,2011
1,24,Serial drama,Fox,192,9,2001
2,30 Rock,Sitcom,NBC,139,7,2006
3,3rd Rock from the Sun,Sitcom,NBC,139,6,1996
4,9/1/2001,Procedural drama,Fox,96,6,2018


In [18]:
# Number of distinct shows in the metadata table
len(imdb['Show'].unique())

304

## Air Date Data

In [19]:
# Split Episode ranges into separate rows, e.g. episode '21-22' becomes two rows
# (21 and 22) that share the same Show, Season and Air Date.
# Only the '-'-separated tokens themselves are kept: '21-23' would yield 21 and
# 23, not 22. Tokens that are not purely digits are discarded.
# Done with split + explode instead of building one DataFrame per row, which is
# far faster and does not crash when an Episode value is missing.
airdate_data_split = (
    airdate_data[['Show', 'Season', 'Episode', 'Air Date']]
    .assign(Episode=lambda frame: frame['Episode'].str.split('-'))
    .explode('Episode')
)
is_numeric_episode = (
    airdate_data_split['Episode']
    .map(lambda token: isinstance(token, str) and token.isdigit())
    .astype(bool)
)
airdate_data_split = airdate_data_split[is_numeric_episode].reset_index(drop=True)

# Convert 'Season' and 'Episode' columns to integers
airdate_data_split[['Season','Episode']] = airdate_data_split[['Season','Episode']].astype(int)

airdate_data_split.head()

Unnamed: 0,Show,Season,Episode,Air Date
0,2 Broke Girls,1,1,19-Sep-11
1,2 Broke Girls,1,2,26-Sep-11
2,2 Broke Girls,1,3,3-Oct-11
3,2 Broke Girls,1,4,10-Oct-11
4,2 Broke Girls,1,5,17-Oct-11


In [20]:
# Number of distinct shows with air-date rows. It is printed explicitly because
# a notebook cell only displays the value of its last expression.
print(len(airdate_data_split.Show.unique()))
# Column types after the integer conversion
airdate_data_split.dtypes

Show        object
Season       int32
Episode      int32
Air Date    object
dtype: object

In [21]:
# Function to check for invalid dates
def check_invalid_dates(df_date_column, date_format):
    """Return the positions of values that cannot be parsed with `date_format`.

    Parameters
    ----------
    df_date_column : iterable
        Date values to test, e.g. a DataFrame column.
    date_format : str
        Format string handed to `pd.to_datetime`.

    Returns
    -------
    list of int
        Zero-based positions (enumeration order, not index labels) of the
        values for which `pd.to_datetime` raised a ValueError.
    """
    def _fails_to_parse(value):
        try:
            pd.to_datetime(value, format=date_format)
        except ValueError:
            return True
        return False

    return [
        position
        for position, value in enumerate(df_date_column)
        if _fails_to_parse(value)
    ]

# Find the 'Air Date' entries that do not match the day-month-year layout
# '%d-%b-%y', report how many there are, and display the offending values
invalid_indices = check_invalid_dates(
    airdate_data_split['Air Date'],
    date_format='%d-%b-%y',
)
print(len(invalid_indices), "rows with invalid 'Air Date' values")
airdate_data_split['Air Date'].iloc[invalid_indices]

29 rows with invalid 'Air Date' values


878                  Aired in syndication
879                  Aired in syndication
880                  Aired in syndication
1004                 "Career Day, Part 2"
4447                              Unaired
10092                             Unaired
10093                             Unaired
10094                             Unaired
10095                             Unaired
10096                             Unaired
11691                             Unaired
12748                             Unaired
16537                             Unaired
16538                             Unaired
16539                             Unaired
16540                             Unaired
16541                             Unaired
17307                             Unaired
21738                             Unaired
21739                             Unaired
24288                             Unaired
25478    Holly Hester & Apryl Huntzinger 
26692                             Unaired
26693                             

In [22]:
# Remove the rows whose 'Air Date' could not be parsed.
# `invalid_indices` holds positions; they are used as index labels here, which
# matches because the frame's index was reset to 0..n-1 when it was built.
airdate_data_split = airdate_data_split.drop(index=invalid_indices)

In [23]:
# Parse 'Air Date' text such as '19-Sep-11' into datetime values.
# NOTE(review): '%y' is a two-digit year, so the century is inferred by the
# parser. Confirm that no show in the data aired early enough to be mapped
# into the wrong century.
airdate_data_split['Air Date'] = pd.to_datetime(
    airdate_data_split['Air Date'],
    format='%d-%b-%y',
)
airdate_data_split.head(n=5)

Unnamed: 0,Show,Season,Episode,Air Date
0,2 Broke Girls,1,1,2011-09-19
1,2 Broke Girls,1,2,2011-09-26
2,2 Broke Girls,1,3,2011-10-03
3,2 Broke Girls,1,4,2011-10-10
4,2 Broke Girls,1,5,2011-10-17


## Cancelled Shows

In [24]:
# Preview the per-show cancellation coding
cancelled.head(n=5)

Unnamed: 0,Show,Cancelled
0,2 Broke Girls,1
1,24,0
2,30 Rock,0
3,3rd Rock from the Sun,0
4,9-1-1: Lone Star,2


In [25]:
# Number of shows per cancellation code
# NOTE(review): the meaning of the codes 0 / 1 / 2 is not documented in this notebook
cancelled['Cancelled'].value_counts()

Cancelled
0    151
1     52
2     30
Name: count, dtype: int64

In [26]:
# Number of distinct shows in the cancellation table
len(cancelled['Show'].unique())

233

# Merge data

In [27]:
# Merge nlpdata and imdb dataframes on 'Show'.
# Inner join: episodes whose show has no imdb row are dropped.
# validate='many_to_one' makes pandas raise a MergeError if 'Show' is ever
# duplicated in imdb, instead of silently multiplying the episode rows.
nlpdata_imdb = pd.merge(nlpdata, imdb, on='Show', how='inner', validate='many_to_one')
nlpdata_imdb.head()

Unnamed: 0,Show,Season,Episode,sd_div_mean_1,sd_div_mean_2,sd_div_mean_3,sd_sum_1,sd_sum_2,sd_sum_3,sd_scaled_1,...,space_3,time_1,time_2,time_3,vw,Genre,Network,No.of episodes,No.of seasons,Year
0,2 Broke Girls,1,1,5658.35942,9159.676336,4626.990287,18.665142,20.545673,21.697182,598.633647,...,5.41375,4.322857,3.005714,2.5725,19.37,Sitcom,CBS,138,6,2011
1,2 Broke Girls,1,2,6280.502095,3451.804293,7127.226694,19.733889,20.345823,17.44019,629.842904,...,6.631429,4.492857,4.335714,3.932857,11.75,Sitcom,CBS,138,6,2011
2,2 Broke Girls,1,3,3931.604095,3977.461758,5506.511934,21.55942,20.759948,19.83783,694.243568,...,4.38625,4.81,2.487143,3.27375,11.42,Sitcom,CBS,138,6,2011
3,2 Broke Girls,1,4,5899.921656,7097.7331,3386.857917,21.267363,20.248501,19.434308,684.38765,...,6.192857,4.957143,3.747143,4.097143,10.71,Sitcom,CBS,138,6,2011
4,2 Broke Girls,1,6,9570.497331,5883.807213,6103.310464,20.229745,21.176403,21.713179,648.841555,...,5.53875,4.502857,4.93,3.62375,11.19,Sitcom,CBS,138,6,2011


In [28]:
# Distinct shows remaining after the imdb join
len(nlpdata_imdb['Show'].unique())

235

In [29]:
# Attach air dates: inner join of nlpdata_imdb with airdate_data_split on the
# episode key ('Show', 'Season', 'Episode'); episodes without a matching
# air-date row are dropped
nlpdata_imdb_airdate = nlpdata_imdb.merge(
    airdate_data_split,
    how='inner',
    on=['Show', 'Season', 'Episode'],
)
nlpdata_imdb_airdate.head()

Unnamed: 0,Show,Season,Episode,sd_div_mean_1,sd_div_mean_2,sd_div_mean_3,sd_sum_1,sd_sum_2,sd_sum_3,sd_scaled_1,...,time_1,time_2,time_3,vw,Genre,Network,No.of episodes,No.of seasons,Year,Air Date
0,2 Broke Girls,1,1,5658.35942,9159.676336,4626.990287,18.665142,20.545673,21.697182,598.633647,...,4.322857,3.005714,2.5725,19.37,Sitcom,CBS,138,6,2011,2011-09-19
1,2 Broke Girls,1,2,6280.502095,3451.804293,7127.226694,19.733889,20.345823,17.44019,629.842904,...,4.492857,4.335714,3.932857,11.75,Sitcom,CBS,138,6,2011,2011-09-26
2,2 Broke Girls,1,3,3931.604095,3977.461758,5506.511934,21.55942,20.759948,19.83783,694.243568,...,4.81,2.487143,3.27375,11.42,Sitcom,CBS,138,6,2011,2011-10-03
3,2 Broke Girls,1,4,5899.921656,7097.7331,3386.857917,21.267363,20.248501,19.434308,684.38765,...,4.957143,3.747143,4.097143,10.71,Sitcom,CBS,138,6,2011,2011-10-10
4,2 Broke Girls,1,6,9570.497331,5883.807213,6103.310464,20.229745,21.176403,21.713179,648.841555,...,4.502857,4.93,3.62375,11.19,Sitcom,CBS,138,6,2011,2011-10-24


In [30]:
# Distinct shows remaining after the air-date join
len(nlpdata_imdb_airdate['Show'].unique())

235

In [31]:
# Left join (since we want cancelled and not cancelled shows): every episode
# row is kept, and shows missing from `cancelled` get NaN in 'Cancelled'.
# validate='many_to_one' makes pandas raise a MergeError if a show is ever
# listed more than once in `cancelled`, instead of silently duplicating its
# episode rows.
merged_data = pd.merge(
    nlpdata_imdb_airdate,
    cancelled,
    on='Show',
    how='left',
    validate='many_to_one',
)
merged_data.head()

Unnamed: 0,Show,Season,Episode,sd_div_mean_1,sd_div_mean_2,sd_div_mean_3,sd_sum_1,sd_sum_2,sd_sum_3,sd_scaled_1,...,time_2,time_3,vw,Genre,Network,No.of episodes,No.of seasons,Year,Air Date,Cancelled
0,2 Broke Girls,1,1,5658.35942,9159.676336,4626.990287,18.665142,20.545673,21.697182,598.633647,...,3.005714,2.5725,19.37,Sitcom,CBS,138,6,2011,2011-09-19,1.0
1,2 Broke Girls,1,2,6280.502095,3451.804293,7127.226694,19.733889,20.345823,17.44019,629.842904,...,4.335714,3.932857,11.75,Sitcom,CBS,138,6,2011,2011-09-26,1.0
2,2 Broke Girls,1,3,3931.604095,3977.461758,5506.511934,21.55942,20.759948,19.83783,694.243568,...,2.487143,3.27375,11.42,Sitcom,CBS,138,6,2011,2011-10-03,1.0
3,2 Broke Girls,1,4,5899.921656,7097.7331,3386.857917,21.267363,20.248501,19.434308,684.38765,...,3.747143,4.097143,10.71,Sitcom,CBS,138,6,2011,2011-10-10,1.0
4,2 Broke Girls,1,6,9570.497331,5883.807213,6103.310464,20.229745,21.176403,21.713179,648.841555,...,4.93,3.62375,11.19,Sitcom,CBS,138,6,2011,2011-10-24,1.0


In [32]:
# Distinct shows in the final merged table
len(merged_data['Show'].unique())

235

In [105]:
# Write the merged table to 'final_merged_data.csv' without the row index
merged_data.to_csv(
    'final_merged_data.csv',
    index=False,
)