# 1. Mounting Google Drive
Follow this guide (using last method, "Bonus Method"):
https://towardsdatascience.com/3-ways-to-load-csv-files-into-colab-7c14fcbdcb92 

Step 1: Add "IS425 Text Mining Dataset" folder from our shared folder into "My Drive" (right-click TM folder > "add to My Drive")

In [1]:
from google.colab import drive
drive.mount('drive', force_remount=True)

In [2]:
%cd "drive/My Drive"

# 2. Specifying Dependencies

In [31]:
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import seaborn as sns
import multiprocessing as mp

from statistics import mean
from scipy import stats

import nltk
import os
import time

import math
import re
import string

import itertools


# The following statement imports a class called PlaintextCorpusReader.
from nltk.corpus import PlaintextCorpusReader

from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

In [27]:
nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

# 3. Loading Dataset

## 3.1 RAW Files

In [46]:
# #Run if need access to corpus. Separated it because it takes a long time to run
raw_folder = "/content/drive/My Drive/IS 425 Text Mining Dataset/ARC/raw/"
interim_folder = "/content/drive/My Drive/IS 425 Text Mining Dataset/ARC/interim/"
arc_corpus_filename = "ARC_Corpus.txt"

# raw_folder = "../../data/raw/"
# interim_folder = "../../data/interim"
# arc_corpus_filename = "ARC_Corpus.txt"

In [9]:
f = open(os.path.join(raw_folder, arc_corpus_filename), "r")

arc_lines = f.read().splitlines()

f.close()

## 3.2 Splitting the data into chunks

In [10]:
n_cores = 60
chunk_size = math.ceil(len(arc_lines) / n_cores)
chunk_size = int(chunk_size)

In [11]:
def chunks(lst, n):
    """Yield successive n-sized chunks from lst."""
    for i in range(0, len(lst), n):
        yield lst[i:i + n]

arc_lines_chunk =list(chunks(arc_lines, chunk_size))
print(len(arc_lines_chunk))

60


In [12]:
arc_lines[0]

'Large international companies are involved in bauxite, iron ore, diamond, and gold mining operations.'

In [13]:
arc_lines[1]

'Paleoceanography, 8(2): 193-208.'

# 4. Data Cleaning

## 4.1 Expand Contractions

In [14]:
def expand_contraction(lines):
    new_lines = []
    for line in lines:
            
        # replacing 's to nothing
        temp = re.sub(r"'s", "", line)
        # replacing 've to have
        temp = re.sub(r"'ve", ' have', temp)
        # replacing 're to are
        temp = re.sub(r"'re", ' are', temp)
        # replacing n't to not
        temp = re.sub(r"n't", ' not', temp)
        
        # replacing 're to are
        temp = re.sub(r"'d", ' would', temp)
        # replacing n't to not
        temp = re.sub(r"'ll", ' will', temp)
        # replacing 're to are
        temp = re.sub(r"'m", ' am', temp)
        
        new_lines.append(temp)
    return new_lines

In [15]:
start = time.time()

with mp.Pool(n_cores) as p:
    arc_lines_expanded = p.map(expand_contraction, arc_lines_chunk)

end = time.time()
print(end - start)

31.14186429977417


## 4.2 Cleaning Text (e.g. urls and punctuations)

In [16]:
def cleaning_text(lines):
    new_lines = []
    for line in lines:
        # convert all tweets to lower case
        temp = line.lower()
        # remove www and http URLs
        temp = re.sub('((www.\S+)|(http\S+))','',temp)
        # some words have "an- tlu-opologist"
        temp = temp.replace("- ", "")
        # some words have "href\\"
        temp = temp.replace("\\", "")
        
        temp = re.sub(pattern=r'[{}]'.format(string.punctuation), 
                repl='', 
                string=temp
               ).strip()
        
        new_lines.append(temp)
    return new_lines

In [17]:
start = time.time()

with mp.Pool(n_cores) as p:
    arc_lines_cleaned = p.map(cleaning_text, arc_lines_expanded)

end = time.time()
print(end - start)

32.61854815483093


## 4.3 Tokenizing sentences into words

In [18]:
def tokenizing_sentences(lines):
    new_lines = []
    for line in lines:
        temp = line.split()
        
        if temp != [] and temp != ['']:
            new_lines.append(temp)
    return new_lines

In [19]:
start = time.time()

with mp.Pool(n_cores) as p:
    arc_lines_tokenized = p.map(tokenizing_sentences, arc_lines_cleaned)

end = time.time()
print(end - start)

213.1732075214386


## 4.4 Finding/Consolidating Additional Contractions

In [20]:
def find_contractions(lines):
    new_list = []
    punctuations = set(string.punctuation)
    
    for line in lines:
        new_line = []
        for word in line:
            if any([char in word for char in punctuations]):
                new_line.append(word)
            
        new_list.append(new_line)
    return new_list

In [21]:
start = time.time()

with mp.Pool(n_cores) as p:
    contractions_chunk = p.map(find_contractions, arc_lines_tokenized)

end = time.time()
print(end - start)

200.70624136924744


In [22]:
lists_of_contractions = []
for chunk in contractions_chunk:
    for index, line in enumerate(chunk[0:50]):
        lists_of_contractions += line

print(len(lists_of_contractions))
lists_of_contractions = set(lists_of_contractions)
print(len(lists_of_contractions))

0
0


## 4.5 Stopword Removal

In [25]:
def stopword_removal(lines):
    new_list = []
    for line in lines:
        stop_list = set(stopwords.words('english'))
        # remove all stop words and not null
        words = [token.strip() for token in line if token.strip() not in stop_list and token.strip() != '']
        new_list.append(words)
    return new_list

In [28]:
start = time.time()

with mp.Pool(n_cores) as p:
    arc_lines_stopwords = p.map(stopword_removal, arc_lines_tokenized)

end = time.time()
print(end - start)

231.5580952167511


# 5 Generate DataFrame (For Data Exploration)

In [34]:
arc_lines_tokenized = list(itertools.chain(*arc_lines_tokenized))
arc_lines_stopwords = list(itertools.chain(*arc_lines_stopwords))

arc_lines = {
    "tokenized": arc_lines_tokenized,
    "stopwords_cleaned": arc_lines_stopwords
}

In [35]:
df = pd.DataFrame(arc_lines)

In [36]:
df['tokenized_len'] = df['tokenized'].apply(lambda x: len(x))
df['stopwords_len'] = df['stopwords_cleaned'].apply(lambda x: len(x))

In [38]:
round(df.describe(),2)

Unnamed: 0,tokenized_len,stopwords_len
count,14621720.0,14621720.0
mean,16.22,9.75
std,12.52,7.74
min,1.0,0.0
25%,7.0,5.0
50%,14.0,8.0
75%,22.0,13.0
max,616.0,616.0


## 5.1 Remove sentences with less than 5 words (stop words removed)

In [40]:
print("Before Filtering:", df.shape[0])
filtered_df = df[df['stopwords_len'] > 5]
print("After Filtering:", filtered_df.shape[0])

Before Filtering: 14621720
After Filtering: 10052037


## 5.2 Generating a subset of the dataset (5%)

In [42]:
filtered_df.quantile([0.9, 0.95, 1])

Unnamed: 0,tokenized_len,stopwords_len
0.9,35.0,20.0
0.95,42.0,25.0
1.0,616.0,616.0


In [44]:
top_5_percentile_df = filtered_df[filtered_df['stopwords_len'] >= 25]
top_5_percentile_df.shape[0]

530201

## 5.3 Saving the subset

In [47]:
top_5_percentile_df.to_csv(os.path.join(interim_folder, 'corpus_subset.csv'), index=False)