In [1]:
import os
import numpy as np
import pandas as pd
import glob
import matplotlib.pyplot as plt
import seaborn as sns
import re

from fuzzywuzzy import process, fuzz
from notnews import *

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/Bashar/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /Users/Bashar/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [2]:
# Directory holding the raw transcript CSV files (relative path)
FILE_PATH = '../dataverse_files/'
# Sub-directory of FILE_PATH that receives prediction outputs and already-processed source files
PROCESSED_PATH = FILE_PATH + 'processed/'
# Switch for the time-consuming soft-news prediction run.
# NOTE(review): no visible cell checks this flag — confirm it is actually used as a guard.
PREDICT_NEWS = False

In [3]:
def split_text(s, n):
    """Yield consecutive chunks of ``s`` containing at most ``n`` words each.

    ``s`` is converted with ``str()`` and split on whitespace; every chunk is
    the words re-joined with single spaces. The final chunk may be shorter
    than ``n`` words. Returns a generator.
    """
    words = str(s).split()
    chunk_starts = range(0, len(words), n)
    return (' '.join(words[start:start + n]) for start in chunk_starts)

In [4]:
def split_df(df, column, num=500, pattern=None):
    """Explode a text column into chunks of at most ``num`` words, one row per chunk.

    Rows whose ``column`` value is null are dropped. For every remaining row
    the text is cleaned with ``pattern.sub('', text)``, split into word chunks
    by ``split_text``, and the row is repeated once per chunk with ``column``
    replaced by the chunk. Rows that yield no chunks (no words left after
    cleaning) do not appear in the result. Original index labels are kept, so
    they repeat for rows that were split into several chunks.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame; it is not modified.
    column : str
        Name of the text column to split.
    num : int, default 500
        Maximum number of words per chunk.
    pattern : compiled regex, optional
        Pattern whose matches are removed before splitting. Defaults to the
        bracketed-timestamp pattern ``\\[[^a-zA-Z]*\\]``. Passing it explicitly
        (or relying on the default) means the function no longer depends on a
        module-level ``pattern`` variable having been defined beforehand.

    Returns
    -------
    pandas.DataFrame
        A copy of the selected rows with ``column`` holding the chunks.
    """
    if pattern is None:
        # Same expression the notebook uses for transcript timestamps.
        pattern = re.compile(r'\[[^a-zA-Z]*\]')

    df = df.dropna(subset=[column])

    row_positions = []
    chunks = []
    for position, raw_text in enumerate(df[column].astype(str)):
        cleaned_text = pattern.sub('', raw_text)
        for chunk in split_text(cleaned_text, num):
            # positional (not label) index, consumed by .iloc below
            row_positions.append(position)
            chunks.append(chunk)

    new_df = df.iloc[row_positions, :].copy()
    new_df[column] = chunks
    return new_df

In [5]:
def predict_soft_news_file(file, channel_name, col='text'):
    """Chunk one transcript CSV, score the chunks for soft news, and archive the file.

    Steps: read ``file`` (ISO-8859-1), split the ``col`` column into word
    chunks with ``split_df``, run ``pred_soft_news_us`` on the chunks, write
    the result to ``PROCESSED_PATH`` as ``soft_news_<filename>``, then move
    the source file into ``PROCESSED_PATH``.

    Because the source file is moved, calling this twice with the same path
    fails on the second call.

    Parameters
    ----------
    file : str
        Path of the CSV file to process.
    channel_name : str
        Channel the file belongs to. Currently unused; kept so existing call
        sites keep working.
    col : str, default 'text'
        Column holding the transcript text. Used for both the splitting and
        the prediction step.

    Returns
    -------
    The object returned by ``pred_soft_news_us`` for the chunked data
    (it must provide ``to_csv``).
    """
    print (file)
    filename = os.path.basename(file)
    # Create the output folder up front so a missing directory cannot make the
    # (very long) prediction fail at the final write.
    os.makedirs(PROCESSED_PATH, exist_ok=True)

    df = pd.read_csv(file, encoding='ISO-8859-1')
    print (f'Processing: {file} ....')
    # Split on the same column that is later scored (was hard-coded to 'text').
    split_data = split_df(df, col)
    print (f'Finished splitting {file} into chunks')
    soft_news_df = pred_soft_news_us(split_data, col=col)
    print (f'Saving results for {file}\n\n')
    soft_news_df.to_csv(os.path.join(PROCESSED_PATH, 'soft_news_' + filename))
    # Move the raw file next to its results so it is not picked up again.
    os.rename(file, os.path.join(PROCESSED_PATH, filename))

    return soft_news_df

# Predicting Soft News probability
If `PREDICT_NEWS = True`, the prediction will run. This is very time-consuming and should only be run once.
Afterwards, the generated files should be saved to disk and preserved for future analysis,
  as the run time could be days, depending on the number of records to analyze.

In [6]:
# Defining the pattern of timestamps in transcripts to remove:
# matches "[", then any run (possibly empty) of characters that are not ASCII letters, then "]"
pattern = re.compile(r'\[[^a-zA-Z]*\]')

In [7]:
# Empty accumulator; the per-file prediction results are concatenated onto it in the cells below
full_df = pd.DataFrame()

In [None]:
# ingesting CNN Files
# glob returns paths in arbitrary order; sorting keeps the processing order
# (and therefore the row order of full_df) reproducible between runs.
files = sorted(glob.glob(FILE_PATH + 'cnn*.csv'))

# Collect the per-file results and concatenate once at the end instead of
# re-copying the growing frame on every iteration.
soft_news_frames = [full_df]

for file in files:
    soft_news_df = predict_soft_news_file(str(file), 'CNN')
    soft_news_frames.append(soft_news_df)

# ingesting NBC News Files
file = FILE_PATH + 'msnbc.csv'
soft_news_df = predict_soft_news_file(file, 'NBC News')
soft_news_frames.append(soft_news_df)

full_df = pd.concat(soft_news_frames, axis=0)

../dataverse_files/cnn-1.csv
Processing: ../dataverse_files/cnn-1.csv ....
Finished splitting ../dataverse_files/cnn-1.csv into chunks
Using model data from /Users/Bashar/opt/anaconda3/envs/nonconform/lib/python3.8/site-packages/notnews/data/us_model/nyt_us_soft_news_classifier.joblib...
Using vectorizer data from /Users/Bashar/opt/anaconda3/envs/nonconform/lib/python3.8/site-packages/notnews/data/us_model/nyt_us_soft_news_vectorizer.joblib...
Loading the model and vectorizer data file...
Saving results for ../dataverse_files/cnn-1.csv


../dataverse_files/cnn-2.csv
Processing: ../dataverse_files/cnn-2.csv ....
Finished splitting ../dataverse_files/cnn-2.csv into chunks
Saving results for ../dataverse_files/cnn-2.csv


../dataverse_files/cnn-3.csv
Processing: ../dataverse_files/cnn-3.csv ....
Finished splitting ../dataverse_files/cnn-3.csv into chunks
Saving results for ../dataverse_files/cnn-3.csv


../dataverse_files/cnn-4.csv
Processing: ../dataverse_files/cnn-4.csv ....
Finished sp

### Saving combined file to disk

In [None]:
# Persist the combined predictions accumulated so far.
# Note: this runs before the MSNBC ingestion below, so MSNBC rows are not part of this file.
full_df.to_csv(PROCESSED_PATH+'cnn_nbc_news_soft_news_predict.csv')

# ingesting MSNBC Files
file = FILE_PATH + 'msnbc--2003--2014.csv'
soft_news_df = predict_soft_news_file(file, 'MSNBC')
# DataFrame has no .concat method; concatenation is the module-level pd.concat
full_df = pd.concat([full_df, soft_news_df], axis=0)

# The block above used to be repeated verbatim for the same path. The repeat was
# removed: predict_soft_news_file moves the source file into PROCESSED_PATH, so a
# second pass over the same path cannot find the file.