In [1]:
import os
import re
import glob
import codecs
import pathlib

from tqdm import tqdm

import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

import utils

In [2]:
%load_ext autoreload
%autoreload 2

In [3]:
sources_files = utils.get_sources_files()
text_folders = utils.get_text_folders()

In [5]:
%%time

summary_stats, file_lookup = [], []
current_year, current_month = None, None
for i, sources_file in enumerate(sources_files, 1): 
    print(f"Source file ({i:>02}/{len(sources_files):>02}): {sources_file}... ", end="", flush=True)
    
    sources = utils.read_sources_file(sources_file)
    sources["month"] = sources["date"].dt.month
    sources["year"] = sources["date"].dt.year

    sources_groups = sources.groupby(["year", "month", "country"])
    for (year, month, country), df in sources_groups:
        if (
            sources["year"].unique().shape[0] > 1 and
            (year != current_year or month != current_month)
        ):
            print(f"\n  {utils.format_year(year)}-{utils.format_month(month)}: ", end="", flush=True)
            current_year, current_month = year, month
            
        print(country, end=", ", flush=True)
        summary_stats.append((year, month, country, *utils.get_basic_summary_stats(df)))
        
        try:
            text_file_path = utils.get_text_file_path(text_folders, year, month, country)
        except:
            text_file_path = None
            
        file_lookup.append((year, month, country, sources_file, text_file_path))
        
        if text_file_path:
            articles = df.merge(utils.read_text_file(text_file_path), on="id", how="outer")
            articles.apply(utils.export_report, axis=1, path=utils.CLEAN_DATA_FOLDER)
    
    print()
    
summary_stats_df = pd.DataFrame(summary_stats, columns=["year", "month", "country", "num_sources", "num_articles", "total_words"])
summary_stats_df.to_csv("summary_stats.csv", index=False)

file_lookup_df = pd.DataFrame(file_lookup, columns=["year", "month", "country", "source_file", "text_file"])
file_lookup_df.to_csv("file_lookup.csv", index=False)

Source file (01/49): data\raw\now_sources_pt1.txt... 
  10-01: AU, BD, CA, GB, GH, HK, IE, IN, JM, KE, LK, MY, NG, NZ, PH, PK, SG, TZ, US, ZA, 
  10-02: AU, BD, CA, GB, GH, HK, IE, IN, JM, KE, LK, MY, NG, NZ, PH, PK, SG, TZ, US, ZA, 
  10-03: AU, BD, CA, GB, GH, HK, IE, IN, JM, KE, LK, MY, NG, NZ, PH, PK, SG, TZ, US, ZA, 
  10-04: AU, BD, CA, GB, GH, HK, IE, IN, JM, KE, LK, MY, NG, NZ, PH, PK, SG, US, ZA, 
  10-05: AU, BD, CA, GB, GH, HK, IE, IN, JM, KE, LK, MY, NG, NZ, PH, PK, SG, TZ, US, ZA, 
  10-06: AU, BD, CA, GB, GH, HK, IE, IN, JM, KE, LK, MY, NG, NZ, PH, PK, SG, TZ, US, ZA, 
  10-07: AU, BD, CA, GB, GH, HK, IE, IN, JM, KE, LK, MY, NG, NZ, PH, PK, SG, US, ZA, 
  10-08: AU, BD, CA, GB, GH, HK, IE, IN, JM, KE, LK, MY, NG, NZ, PH, PK, SG, TZ, US, ZA, 
  10-09: AU, BD, CA, GB, GH, HK, IE, IN, JM, KE, LK, MY, NG, NZ, PH, PK, SG, US, ZA, 
  10-10: AU, BD, CA, GB, GH, HK, IE, IN, JM, KE, LK, MY, NG, NZ, PH, PK, SG, TZ, US, ZA, 
  10-11: AU, BD, CA, GB, GH, HK, IE, IN, JM, KE, LK, MY, N

AttributeError: 'NoneType' object has no attribute 'groups'

In [101]:
file_lookup_df[file_lookup_df.text_file.isna()]

Unnamed: 0,year,month,country,source_file,text_file
357,2011,7,AU,data\raw\now_sources_pt1.txt,
497,2012,2,AU,data\raw\now_sources_pt1.txt,
517,2012,3,AU,data\raw\now_sources_pt1.txt,
577,2012,6,AU,data\raw\now_sources_pt1.txt,


In [93]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [94]:
summary_stats.append((year, month, country, *utils.get_basic_summary_stats(df)))

text_file_path = utils.get_text_file_path(text_folders, year, month, country)
file_lookup.append((year, month, country, sources_file, text_file_path))

articles = df.merge(utils.read_text_file(text_file_path), on="id", how="outer").sample(10, replace=True)
articles.apply(utils.export_report, axis=1, path=utils.CLEAN_DATA_FOLDER)

None


TypeError: join() argument must be str, bytes, or os.PathLike object, not 'NoneType'

In [70]:
summary_stats

[(2010, 1, 'AU', 61, 1176, 696703)]

In [65]:
text_file_path = utils.get_text_file_path(text_folders, year, month, country)

ValueError: not enough values to unpack (expected 5, got 4)

In [57]:
summary_stats_df.head()

Unnamed: 0,year,month,country,num_sources,num_articles,total_words
0,2017,1,AU,485,19773,10550020
1,2017,1,BD,9,1256,482402
2,2017,1,CA,646,20941,12669330
3,2017,1,GB,1291,24667,13377506
4,2017,1,GH,19,8690,2970382


In [59]:
file_lookup_df.head()

Unnamed: 0,year,month,country,source_file,text_file
0,2017,1,AU,data\raw\sources-17-01.txt,data\raw\text-17-01\text_17-01-AU.txt
1,2017,1,BD,data\raw\sources-17-01.txt,data\raw\text-17-01\text_17-01-BD.txt
2,2017,1,CA,data\raw\sources-17-01.txt,data\raw\text-17-01\text_17-01-CA.txt
3,2017,1,GB,data\raw\sources-17-01.txt,data\raw\text-17-01\text_17-01-GB.txt
4,2017,1,GH,data\raw\sources-17-01.txt,data\raw\text-17-01\text_17-01-GH.txt


In [None]:
groups = reporuniquegroupby(["country", "year"]).id.count().to_frame().reset_index()

fig, ax = plt.subplots(figsize=(20, 8))
sns.barplot(
    x="country", 
    y="id", 
    hue="year",
    data=groups,
    ax=ax
)

ax.set_xlabel("Country", fontsize=16)
ax.set_ylabel("Number of Reports", fontsize=16)
ax.set_title("Number of Reports per Country per Year", fontsize=22)

ax.tick_params(axis='both', which='major', labelsize=13)
ax.legend(loc='center left', bbox_to_anchor=(1, 0.5), fontsize=12);

In [None]:
words_per_year = reports[['n_words','year','country']].groupby(['country','year']).sum().reset_index()

fig, ax = plt.subplots(figsize=(20, 8))
sns.barplot(
    x="country", 
    y="n_words", 
    hue="year",
    data=words_per_year,
    ax=ax
)
ax.set_xlabel("Country", fontsize=16)
ax.set_ylabel("Number of Words", fontsize=16)
ax.set_title("Number of Words per Country per Year", fontsize=22)

ax.tick_params(axis='both', which='major', labelsize=13)
ax.legend(loc='center left', bbox_to_anchor=(1, 0.5), fontsize=12);

Example, news report with textID 11241:
- pushed to a file with name 11241_US_13-01-06.txt. 
- 11241_US_13-01-06.txt should be in the US/2006 folder. 

In [None]:
reports["export_folder"] = reports.apply(export_report, axis=1, path="data/clean")

reports.to_csv("data/clean/all_reports.csv", index=False)
reports

-----------------------------------------

In [None]:
BBC_df = reports[reports['website'] == 'BBC News']

In [None]:
BBC_df['vector'] = BBC_df['text'].str.lower().str.split()

In [None]:
from nltk.corpus import stopwords
stop = stopwords.words('english')

In [None]:
import string
print(string.punctuation)

In [None]:
BBC_df['vector'] = BBC_df['vector'].apply(lambda x: [item for item in x if item not in stop])

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(BBC_df['vector'])
print(vectorizer.get_feature_names())

In [None]:
reports['website'].value_counts()[reports['website'].value_counts() > 10]

In [None]:
reports['website'].value_counts()

In [None]:
sum(reports['text'] == np.nan)

In [None]:
reports_trim = reports[reports['text'] == ]

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

v = TfidfVectorizer(stop_words='english')
x = v.fit_transform(reports[reports['text'].notnull()]['text'])

In [None]:
x

In [None]:
feature_array = np.array(v.get_feature_names())
tfidf_sorting = np.argsort(x.toarray()).flatten()[::-1]

In [None]:
x.toarray()

In [None]:
len(v.get_feature_names())