In [37]:
import os
import numpy as np
import pandas as pd
import plotly.express as px
import dask.dataframe as dd
import glob

from notnews import *

In [2]:
# Directory holding the raw transcript CSV files (relative to this notebook).
FILE_PATH = '../dataverse_files/'
# Sub-directory that receives prediction results and already-handled raw files.
PROCESSED_PATH = f'{FILE_PATH}processed/'
# Set to True to run the prediction stage over the raw files.
PREDICT_NEWS = False

In [3]:
def split_text(s, n):
    """Lazily yield whitespace-delimited chunks of at most ``n`` words.

    Args:
        s: Value to split; it is coerced with ``str()`` first.
        n: Maximum number of words per chunk (used as the ``range`` step).

    Returns:
        A generator of strings, each holding up to ``n`` words re-joined
        with single spaces. Nothing is yielded when ``s`` has no words.
    """
    words = str(s).split()
    # The range is built eagerly so an invalid step surfaces at call time.
    chunk_starts = range(0, len(words), n)
    return (' '.join(words[start:start + n]) for start in chunk_starts)

In [4]:
def split_df(df, column, num=500):
    """Expand ``df`` so each row carries at most ``num`` words in ``column``.

    Rows whose ``column`` value is missing are dropped. Every remaining row
    is repeated once per chunk produced by ``split_text`` and ``column`` is
    overwritten with that chunk; all other columns are copied unchanged.
    A row whose text yields no chunks does not appear in the result.

    Args:
        df: Source DataFrame.
        column: Name of the text column to split.
        num: Maximum number of words per chunk.

    Returns:
        A new DataFrame; the original row index labels are repeated for
        rows that were split into several chunks.
    """
    non_null = df.dropna(subset=[column])
    row_positions = []
    chunks = []
    for position, text in enumerate(non_null[column].astype(str)):
        pieces = list(split_text(text, num))
        # Repeat the positional row number once per chunk it produced.
        row_positions.extend([position] * len(pieces))
        chunks.extend(pieces)
    expanded = non_null.iloc[row_positions, :].copy()
    expanded[column] = chunks
    return expanded

In [5]:
# Optional prediction stage, gated by PREDICT_NEWS: for every raw CSV in
# FILE_PATH, split the 'text' column into chunks, run pred_soft_news_us on
# them, write the result to PROCESSED_PATH, and move the raw file alongside it.
if PREDICT_NEWS:
    # The destination must exist before results are written or files moved.
    os.makedirs(PROCESSED_PATH, exist_ok=True)
    # List FILE_PATH itself rather than a second hard-coded copy of the path,
    # so the listing and the reads below always refer to the same directory.
    for file in os.listdir(FILE_PATH):
        if file.endswith(".csv"):
            print (file)
            source_path = FILE_PATH + str(file)
            # Deliberately not named `dd`: that would overwrite the
            # `dask.dataframe as dd` module imported at the top of the notebook.
            raw_df = pd.read_csv(source_path, encoding='ISO-8859-1')
            print (f'Processing: {source_path} ....')
            chunked_df = split_df(raw_df, 'text')
            print (f'Finished splitting {source_path} into chunks')
            soft_news_df = pred_soft_news_us(chunked_df, col='text')
            print (f'Saving results for {source_path}\n\n')
            soft_news_df.to_csv('{}soft_news_{}'.format(PROCESSED_PATH, file))
            # Move the raw file out of FILE_PATH so a later run skips it.
            os.rename(FILE_PATH+file, PROCESSED_PATH+file)

In [27]:
# Empty placeholder for the combined results; the file-processing cell below
# fills it.
# The two commented-out lines are a disabled alternative set-up
# (pre-declared columns, dask-backed frame).
#full_df = pd.DataFrame(columns=['channel.name','program.name','full_date','prob_soft_news_us'])
#full_df = dd.from_pandas(full_df, chunksize=250000)
full_df = pd.DataFrame()

In [52]:
# Processing CNN files
# Sorted so the row order of the combined frame is the same on every run
# (glob returns matches in arbitrary order).
processed_files = sorted(glob.glob(PROCESSED_PATH + 'soft_news_cnn*.csv'))

# Columns kept in the combined frame; any that a file lacks are simply skipped.
keep_columns = ['channel.name','program.name','full_date','prob_soft_news_us']

# Collect per-file frames and concatenate once at the end. Concatenating inside
# the loop re-copies the growing frame on every iteration, and appending to a
# frame created in another cell duplicates rows whenever this cell is re-run.
cnn_frames = []
for file in processed_files:
    news_df = pd.read_csv(file, encoding = 'ISO-8859-1')

    # delete any outliers: years outside 1900-2100. Rows with a missing year
    # are kept because comparisons against NaN evaluate to False.
    implausible_year = (news_df['year'] > 2100.0) | (news_df['year'] < 1900.0)
    news_df = news_df.loc[~implausible_year].copy()

    # generate date field in format YYYY-MM-DD
    news_df['full_date'] = pd.to_datetime((news_df.year*10000+news_df.month*100+news_df.date),format='%Y%m%d')

    # cleaning up the dataframe and queueing it for the final concatenation
    cnn_frames.append(news_df.loc[:, news_df.columns.isin(keep_columns)])

# pd.concat raises on an empty list, so fall back to an empty frame when no
# processed CNN files were found.
full_df = pd.concat(cnn_frames, axis=0) if cnn_frames else pd.DataFrame()
    

In [54]:
# (rows, columns) of the combined frame.
full_df.shape

(3699388, 4)

In [55]:
# Index range, column dtypes and memory footprint of the combined frame.
full_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3699388 entries, 0 to 1114473
Data columns (total 4 columns):
 #   Column             Dtype         
---  ------             -----         
 0   channel.name       object        
 1   program.name       object        
 2   prob_soft_news_us  float64       
 3   full_date          datetime64[ns]
dtypes: datetime64[ns](1), float64(1), object(2)
memory usage: 141.1+ MB
