In [1]:
import os
import numpy as np
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
import glob
import matplotlib.pyplot as plt
import seaborn as sns

from fuzzywuzzy import process, fuzz
from notnews import *

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/Bashar/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /Users/Bashar/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [2]:
# Folder that holds the raw transcript CSV files.
FILE_PATH = '../dataverse_files/'
# Sub-folder that receives the scored files and the inputs already handled.
PROCESSED_PATH = f'{FILE_PATH}processed/'
# Switch for the (slow) soft-news scoring cell further down.
PREDICT_NEWS = False

In [3]:
def split_text(s, n):
    """Lazily yield whitespace-joined chunks of at most ``n`` words from ``s``.

    ``s`` is converted with ``str`` and split on whitespace; consecutive runs
    of ``n`` words are re-joined with single spaces. The last chunk may be
    shorter. A generator is returned, so nothing is joined until iteration.
    """
    words = str(s).split()
    chunk_starts = range(0, len(words), n)
    return (' '.join(words[start:start + n]) for start in chunk_starts)

In [4]:
def split_df(df, column, num=500):
    """Explode long text rows into several rows of at most ``num`` words each.

    Rows whose ``column`` value is missing are discarded first. Every remaining
    row is repeated once per chunk produced by ``split_text``; all other
    columns (and the original index label) are copied unchanged, while
    ``column`` receives the chunk text.

    Returns a new DataFrame; ``df`` itself is not modified.
    """
    non_null = df.dropna(subset=[column])

    row_positions = []
    chunk_texts = []
    for position, raw_text in enumerate(non_null[column].astype(str)):
        chunks = list(split_text(raw_text, num))
        # Repeat the positional row number once for each chunk of this row.
        row_positions.extend([position] * len(chunks))
        chunk_texts.extend(chunks)

    expanded = non_null.iloc[row_positions, :].copy()
    expanded[column] = chunk_texts
    return expanded

# Predicting Soft News probability
If `PREDICT_NEWS = True`, the cell below runs the soft-news prediction. This is very time-consuming and should only be run once.
Afterwards, the generated files should be saved to disk and preserved for future analysis,
  as the run time can be days depending on the number of records to analyze.

In [5]:
# Score every raw CSV found in FILE_PATH, save the result under PROCESSED_PATH
# and move the raw file there so it is not scored again on the next run.
if PREDICT_NEWS:
    # Create the output folder up front: without it the save below would fail
    # only after the (very slow) prediction step has already finished.
    os.makedirs(PROCESSED_PATH, exist_ok=True)

    for file in os.listdir(FILE_PATH):
        if not file.endswith(".csv"):
            continue

        source_path = FILE_PATH + str(file)
        print (file)
        dd = pd.read_csv(source_path, encoding='ISO-8859-1')
        print (f'Processing: {source_path} ....')

        # Split long transcripts into chunks of at most 500 words (split_df default).
        split_dd = split_df(dd, 'text')
        print (f'Finished splitting {source_path} into chunks')

        soft_news_df = pred_soft_news_us(split_dd, col='text')
        print (f'Saving results for {source_path}\n\n')

        # The index is written on purpose: it carries the pre-split row label.
        soft_news_df.to_csv('{}soft_news_{}'.format(PROCESSED_PATH, file))
        os.rename(FILE_PATH + file, PROCESSED_PATH + file)

# Processing files

In [6]:
# Empty accumulator; the per-channel frames below are concatenated onto it.
full_df = pd.DataFrame()

## Processing CNN Files

In [None]:
# Processing CNN files
processed_files = glob.glob(PROCESSED_PATH + 'soft_news_cnn*.csv')

# Per-file frames are collected here and concatenated once after the loop,
# instead of re-copying full_df on every iteration.
cnn_frames = []

for file in processed_files:
    # NOTE(review): the soft_news_* files are written with DataFrame.to_csv,
    # whose default encoding is UTF-8; reading them as ISO-8859-1 may mis-decode
    # non-ASCII characters - confirm before changing.
    news_df = pd.read_csv(file, encoding = 'ISO-8859-1')

    # Default to CNN, then relabel every chunk of a transcript as CNN International
    # when any chunk of that transcript mentions "CNN International".
    # 'Unnamed: 0' is presumably the pre-split row label saved by to_csv, so it is
    # matched by value; it must not be used as a row position.
    news_df['channel.name'] = 'CNN'
    mentions_intl = news_df['text'].str.contains('CNN INTERNATIONAL', case=False, na=False)
    intl_transcripts = news_df.loc[mentions_intl, 'Unnamed: 0'].unique()
    news_df.loc[news_df['Unnamed: 0'].isin(intl_transcripts), 'channel.name'] = 'CNN International'

    # delete any outliers (rows with a missing year are kept, as before)
    bad_year = (news_df['year'] > 2100.0) | (news_df['year'] < 1900.0)
    news_df = news_df.loc[~bad_year].copy()

    # generate date field in format YYYY-MM-DD
    news_df['full_date'] = pd.to_datetime((news_df.year*10000+news_df.month*100+news_df.date),format='%Y%m%d')

    # cleaning up the dataframe and queueing it for the full dataframe
    news_df = news_df[news_df.columns[news_df.columns.isin(['channel.name','program.name','full_date','prob_soft_news_us'])]]
    cnn_frames.append(news_df)

full_df = pd.concat([full_df, *cnn_frames], axis=0)
    

In [None]:
# Release the last per-file CNN frame, then show the combined frame's size.
del news_df
full_df.shape

In [None]:
# Summary (columns, dtypes, non-null counts) of the combined frame so far.
full_df.info()

## Processing NBC/MSNBC files
These files have different formats, so each one needs to be processed individually.

### MSNBC 2003-2014 data

In [None]:
# Load the scored MSNBC 2003-2014 file and align its column names with the
# shared schema used for the combined frame.
news_df = (
    pd.read_csv('../dataverse_files/processed/soft_news_msnbc--2003--2014.csv')
    .rename(columns={'Source': 'channel.name', 'Show': 'program.name'})
)
news_df['full_date'] = pd.to_datetime(news_df['Date'])

# keep only the columns that exist in the shared schema
shared_columns = ['channel.name', 'program.name', 'full_date', 'prob_soft_news_us']
news_df = news_df[news_df.columns[news_df.columns.isin(shared_columns)]]

### MSNBC 2010-2021 data

In [None]:
# Load the scored MSNBC 2010-2021 file. Its existing 'program.name' column is
# discarded and 'show_name' takes over that role.
news2_df = (
    pd.read_csv('../dataverse_files/processed/soft_news_msnbc-2010--2021.csv')
    .drop(columns='program.name')
    .rename(columns={'Source': 'channel.name', 'show_name': 'program.name'})
)
news2_df['full_date'] = pd.to_datetime(news2_df['air_date'])

# keep only the columns that exist in the shared schema
shared_columns = ['channel.name', 'program.name', 'full_date', 'prob_soft_news_us']
news2_df = news2_df[news2_df.columns[news2_df.columns.isin(shared_columns)]]


In [None]:
# Report the shape of each MSNBC frame.
for period, frame in (('2003-2014', news_df), ('2010-2021', news2_df)):
    print (f'Size of {period} df: {frame.shape}')

Merging the two dataframes to see if there are any redundant records

In [None]:
# Outer-join the two MSNBC frames on every shared column and tag each row with
# its origin, so that rows present in both files (labelled "both") can be
# counted instead of eyeballing the full merged frame.
msnbc_overlap = news_df.merge(
    news2_df,
    how='outer',
    on=['channel.name', 'program.name', 'full_date', 'prob_soft_news_us'],
    indicator=True,
)
msnbc_overlap['_merge'].value_counts()

In [None]:
# Append the MSNBC 2003-2014 rows to the combined frame.
# NOTE(review): news2_df (2010-2021) is never appended and is deleted a few
# cells below - confirm that excluding it is intentional.
full_df = pd.concat([full_df, news_df], axis=0)

In [None]:
# Size of the combined frame after adding the MSNBC rows.
full_df.shape

In [None]:
# Release the 2010-2021 frame; it is not referenced again below.
del news2_df

### Processing NBC News data

In [None]:
# Load the scored file and label every row as NBC News.
# NOTE(review): the file name says "msnbc" while the rows are labelled
# 'NBC News' - confirm this is the intended source file.
news_df = pd.read_csv(
    '../dataverse_files/processed/soft_news_msnbc.csv',
    encoding='ISO-8859-1',
).assign(**{'channel.name': 'NBC News'})

In [None]:
# Discard rows that have no day-of-month value.
news_df = news_df.dropna(subset=['date'])

In [None]:
# Remove rows whose year is outside 1900-2100 (rows with a missing year are
# not matched by either comparison and therefore stay).
year = news_df['year']
out_of_range = (year > 2100.0) | (year < 1900.0)
news_df = news_df.loc[~out_of_range].copy()

# Combine year / month / day-of-month into a single YYYYMMDD number and parse it.
yyyymmdd = news_df['year'] * 10000 + news_df['month'] * 100 + news_df['date']
news_df['full_date'] = pd.to_datetime(yyyymmdd, format='%Y%m%d')

In [None]:
# Reduce to the shared output schema, then append to the combined frame.
output_columns = ['channel.name', 'program.name', 'full_date', 'prob_soft_news_us']
selected = news_df.columns[news_df.columns.isin(output_columns)]
news_df = news_df[selected]

full_df = pd.concat([full_df, news_df], axis=0)

In [None]:
# Remove rows without a program name.
# full_df was built by concatenating several frames without resetting the
# index, so index labels repeat across source files. Dropping by label would
# also delete valid rows that merely share a label with a null row, so the
# rows are filtered by value instead.
full_df = full_df.dropna(subset=['program.name'])
full_df.shape

In [None]:
# Release the NBC News frame; its rows were already appended to full_df.
del news_df

# Cleaning Data

In [None]:
def get_matching_shows(df):
    """Pair every distinct program name with its closest fuzzy matches.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing a 'program.name' column.

    Returns
    -------
    pandas.DataFrame
        One row per (program, candidate) pair with the columns
        'program.name', 'matched.program' and 'score'. Each name is compared
        against the full list of distinct names, which includes itself.
    """
    # Missing names cannot be compared as strings, so leave them out.
    unique_shows = df['program.name'].dropna().unique().tolist()

    # For each show, prepend its own name to every (candidate, score) result
    # returned by FuzzyWuzzy's token sort ratio scorer.
    match_rows = [
        (show,) + match
        for show in unique_shows
        for match in process.extract(show, unique_shows, scorer=fuzz.token_sort_ratio)
    ]

    return pd.DataFrame(columns=['program.name', 'matched.program', 'score'], data=match_rows)

In [None]:
matching_show_df = get_matching_shows(full_df)

# Keep only near matches: score above 80 and below 100 (identical names are excluded).
matching_show_df = matching_show_df[(matching_show_df.score > 80) & (matching_show_df.score < 100)]
matching_show_df.to_csv('../dataverse_files/raw_matching_shows.csv')

After exporting the matching program names with FuzzyWuzzy, I manually went through the file and kept only one replacement show for each program. If a listed show was not to be changed, its entry was omitted. The resulting file is used as a lookup in the code below to do most of the heavy lifting of converting show names into the same string. After that, a few names still had to be edited manually.

In [None]:
# Load the manually curated lookup (program.name -> matched.program).
matching_show_df = pd.read_csv('../dataverse_files/show_lookup.csv')

# Drop the helper columns and keep a clean 0..n-1 index. The original call to
# reset_index discarded its result, so it had no effect.
matching_show_df = (
    matching_show_df
    .drop(['Unnamed: 0', 'score'], axis=1)
    .reset_index(drop=True)
)
matching_show_df

In [None]:
# Replace program names through an explicit old-name -> new-name lookup.
# Assigning the 'matched.program' Series directly would align on the row index
# rather than on the program name, pairing rows with unrelated replacements
# (or NaN when the row label does not exist in the lookup).
lookup_rows = matching_show_df.dropna(subset=['program.name', 'matched.program'])
show_lookup = dict(zip(lookup_rows['program.name'], lookup_rows['matched.program']))

# Names that are not in the lookup are left unchanged.
full_df['program.name'] = full_df['program.name'].map(lambda name: show_lookup.get(name, name))


In [None]:
# Title-case every program name. str.title() also capitalises the letter after
# an apostrophe (e.g. "'S"); the final cosmetic step lowercases that again.
full_df['program.name'] = full_df['program.name'].str.title()

In [None]:
# Remove rows without a program name. Index labels in full_df repeat across
# the concatenated source files, so rows are filtered by value; dropping by
# label would also remove valid rows that share a label with a null row.
full_df = full_df.dropna(subset=['program.name'])

In [None]:
# Alphabetical list of the distinct program names, for manual inspection.
sorted(full_df['program.name'].unique().tolist())

In [None]:
# Fixing the shows that fuzzy wuzzy didn't catch; the list was small enough
# that doing it by hand was practical.
# Keys are the names as they appear after title-casing, values are the
# canonical names. No value is also a key, so one pass is enough.
manual_show_fixes = {
    'Ac 360 Degrees': 'Anderson Cooper 360 Degrees',
    'American Morning With Paula Zahn': 'American Morning',
    'Cnn American Morning With Paula Zahn': 'American Morning',
    'Cnn International Q&A;': 'Cnn International Q&A',
    'Cnn Late Edition With Wolf Blitzer': 'Cnn Late Edition',
    'Cnn News Night Aaron Brown': 'Cnn Newsnight With Aaron Brown',
    'Cnn Newsnight Aaron Brown': 'Cnn Newsnight With Aaron Brown',
    'Cnn Page One With Nick Charles': 'Cnn Page One',
    'Cnn Saturday Edition': 'Cnn Saturday',
    'Cnn Showdown On Iraq': 'Cnn Showdown: Iraq',
    'Cnn The Point With Greta Van Susteren': 'Cnn The Point',
    'Cnn The Spin Room Corrected Copy': 'Cnn The Spin Room',
    # Was "CNN'S AMANPOUR", which can never match once the names have been
    # title-cased; this is the title-cased spelling of the same name.
    "Cnn'S Amanpour": 'Amanpour',
    'Cnn&Time;': 'CNN/Time',
    'Evans, Novak, Hunt & Shields': 'Cnn Evans, Novak, Hunt & Shields',
    'Hardball With Chris Matthews': 'Hardball',
    "Hardball With Chris Matthews' Fortuesday": 'Hardball',
    "Hardball With Chris Matthews' Forwednesday": 'Hardball',
    'Jane Velez-Mitchell': 'Issues With Jane Velez-Mitchell',
    "Judy Woodruffs'S Inside Politics": "Judy Woodruff'S Inside Politics",
    'Melissa-Harris-Perry': 'The Melissa Harris-Perry Show',
    'Melissa Harris-Perry': 'The Melissa Harris-Perry Show',
    'Msnbc Hardball': 'Hardball',
    'Politicsnation': 'Politics Nation',
    'The Ed Show With Ed Schultz': 'The Ed Show',
    'The Ed Show Forthursday,July 19Th': 'The Ed Show',
    "The Last Word With Lawrence O' Donnell": "The Last Word With Lawrence O'Donnell",
    "The Last Word With Lawrence O'Donnell' Forthursday": "The Last Word With Lawrence O'Donnell",
    "The Last Word With Lawrence O'Donnell' Fortuesday": "The Last Word With Lawrence O'Donnell",
    "The Last Word With Lawrence O'Donnell' Fothursday": "The Last Word With Lawrence O'Donnell",
    "The Last Word With Lawrence O'Donnell' Wednesday": "The Last Word With Lawrence O'Donnell",
    'The Point With Greta Van Susteren': 'The Point',
    "The Rachel Maddow Show'Forã\x82Â\xa0 Monday": 'The Rachel Maddow Show',
    'World Beat': 'Worldbeat',
}

# Exact-match replacement; names without an entry are left unchanged.
full_df['program.name'] = full_df['program.name'].map(
    lambda name: manual_show_fixes.get(name, name)
)


In [None]:
# final cosmetic changes: restore channel acronyms and the lowercase
# possessive that title-casing altered; applied in this order
cosmetic_fixes = (
    ('Cnn', 'CNN'),
    ('Msnbc', 'MSNBC'),
    ("'S", "'s"),
)
for old_text, new_text in cosmetic_fixes:
    full_df['program.name'] = full_df['program.name'].str.replace(old_text, new_text)

# Analysis Plots

Filtering for shows that have more than 100 transcript records. Because long transcripts were split into multiple rows when they were over 500 words long, the row count overestimates the number of transcripts, but it is a good enough estimate at this point.

In [None]:
# Per-row count of scored records in that row's (program, channel) group;
# keep only rows belonging to groups with more than 100 records.
records_per_show = (
    full_df
    .groupby(['program.name', 'channel.name'])['prob_soft_news_us']
    .transform('count')
)
filt_soft_news_df = full_df[records_per_show > 100]


In [None]:
# Comparing the size of the original dataframe to the new filtered dataframe
print (f'Original dataframe size: {full_df.shape}')
print (f'Filtered dataframe size: {filt_soft_news_df.shape}')

In [None]:
# Mean soft-news probability per (program, channel), ordered by channel and
# then by program name.
soft_news_df_grp = (
    filt_soft_news_df
    .groupby(['program.name', 'channel.name'])['prob_soft_news_us']
    .mean()
    .reset_index()
    .sort_values(['channel.name', 'program.name'])
)

In [None]:
# Scatter of the mean soft-news probability per program, coloured by channel.

# The seaborn style must be set before the figure exists; axes created
# earlier do not pick it up.
sns.set_style("darkgrid", {"axes.facecolor": ".9"})

# One row per program: scale the height with the number of programs, with a
# floor so a short (or empty) summary still yields a valid figure size.
fig_height = max(2, soft_news_df_grp.shape[0] / 3)
fig, ax = plt.subplots(figsize=(10, fig_height))

# Use column names of the summary frame for x, y and hue. The former
# size_norm argument was removed: no size variable is mapped, so it was unused.
sns.scatterplot(x="prob_soft_news_us", y="program.name", data=soft_news_df_grp,
                hue="channel.name", ax=ax)

ax.set_xlabel("Mean soft-news probability")
ax.set_ylabel("Program")
ax.set_title("Average soft-news probability by program and channel")
plt.show()