# Pre-processing Asg

---



* Tokenization
* Filtering
* Lemmatization/Stemming


In [20]:
import re
import nltk
import spacy
import string
import random
import chardet
import numpy as np
import collections
import pandas as pd
import seaborn as sns
import en_core_web_sm 
from sklearn import metrics
import matplotlib.pyplot as plt
from scipy.stats import wilcoxon
from nltk.tokenize import word_tokenize, regexp_tokenize
from sklearn.svm import SVR, SVC, LinearSVC
from matplotlib.ticker import PercentFormatter
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import GridSearchCV
from torch.utils.data.dataset import random_split
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.metrics import recall_score, precision_score, f1_score, accuracy_score
from sklearn.feature_extraction.text import TfidfVectorizer, HashingVectorizer, CountVectorizer

In [83]:
# it gave UnicodeDecode error with encoding utf-8 
red_path = "red.csv"
red_file = pd.read_csv(red_path, encoding="latin-1")
print("red_file cols:", red_file.columns)
red = pd.DataFrame(red_file["title"]).rename(columns={"title":"Headline"})
print("Reddit: \n", red.head(5))


ireland_path = "ireland.csv"
ireland_file = pd.read_csv(ireland_path, encoding="latin-1")
print("\n ireland_file cols:", ireland_file.columns, "\n")
ireland = pd.DataFrame(ireland_file["headline_tokens"]).rename(columns={"headline_tokens":"Headline"})
print("Ireland: \n", ireland.head(5))

red_file cols: Index(['time_created', 'date_created', 'up_votes', 'down_votes', 'title',
       'over_18', 'author', 'subreddit'],
      dtype='object')
Reddit: 
                                           Headline
0                Scores killed in Pakistan clashes
1                 Japan resumes refuelling mission
2                  US presses Egypt on Gaza border
3     Jump-start economy: Give health care to all 
4  Council of Europe bashes EU&UN terror blacklist

 ireland_file cols: Index(['publish_link', 'headline_tokens'], dtype='object') 

Ireland: 
                                             Headline
0  that last jedi reveal about reys parents is ac...
1  julianne hough and husband brooks laich hold h...
2        ellie goulding sizzles in a nude photoshoot
3  laurence fox and lilah parsons hint they are i...
4               the best wireless workout headphones


In [95]:
# Pre-processing Reddit News

# This catches things like %, $, ellipses or abreviations
pattern = r'''(?x)    
(?:[A-Z]\.)+          
|\w+(?:-\w+)*         
|\$?\d+(?:\.\d+)?%?   
|\.\.\.               
|[][.,;"\'?():-_`]    
'''

def preprocessing(inpf):
    """Input: path to text file (text)
       Output: preprocessed txt file"""
    # specify column
    Headline = inpf["Headline"].head(100) #this takes the first 100 rows 
                                        # I used this in the beginnng to run functions quickly
        
    #specify stopwords
    stopwords = nltk.corpus.stopwords.words('english')
    signs = ["\\",'/']
    
    # url tag
    urlremover = re.compile(r'http\S+')
    # html tag remover
    htmlremover = re.compile('<.*?>|&([a-z0-9]+|#[0-9]{1,6}|#x[0-9a-f]{1,6});')
    
    # initialize list of lists (each list is tweet)
    # each word is string sepparated by comma
    tokenized = []
    
    # list of strings, each headline is a string sepparated by comma
    third = []
    
    for line in Headline:
    # Filtering: Remove unecessary character
        line = line.strip().lower().translate(line.maketrans("", "", string.punctuation))
        #print("aaa",line, type(line))
        line = re.sub(r'[0-9]+', '', line)
        
    # Tokenizing
        #line = word_tokenize(line, pattern)
        line = regexp_tokenize(line, pattern)
        
    # store in lines in list of lines
        tokenized.append(line)
        
    # example of a tokenized headline
    print("\n", "tokenized:", tokenized[15])
    
    for line in tokenized:
        
        lemmatized = []
        
        for word in line:

            if ((len(word) > 2 and word.isalpha()) and (not word in stopwords) and (not word in signs)):
            
                lemma = nltk.WordNetLemmatizer().lemmatize(word)
                
                token2 = word_tokenize(lemmatized.append(word))
                
                third.append(token2)
    
        # removes Nans and errors
        if lemmatized != []:
            stuff = " ".join(lemmatized)
            third.append(stuff)
            
    # example of a tokenized headline   
    #print("\n", "third:", third[15])
    
    ppdf = pd.DataFrame(tokenized, columns = ["Headline"])
    return ppdf

red_pp = preprocessing(red)
ireland_pp = preprocessing(ireland)
#print("\n", "Reddit Pre-processed:", "\n", red_pp.head(5))
#print("\n", "Ireland Pre-processed:", "\n", ireland_pp.head(5))


 tokenized: ['nicolas', 'sarkozy', 'angela', 'merkel', 'confirm', 'their', 'opposition', 'to', 'turkey', 'being', 'eu', 'membership']


TypeError: expected string or bytes-like object

In [None]:
    #print("lala", third[2])
    #print("lala", third.head(2))
    #print("third lal ", third)

In [79]:
nltk.WordNetLemmatizer().lemmatize("players")

'player'

---

## Splitting data into a train and a test set 
80% for training and 20% for testing.
Data is shuffled.

In [68]:
# Train Test Split

train_red, test_red = train_test_split(red_pp, test_size=0.20, random_state=42)
print(train_red.head(5))

train_ireland, test_ireland = train_test_split(ireland_pp, test_size=0.20, random_state=42)
print("\n", train_ireland.head(5))

                                             Headline
55                     israel plan egypt border fence
88                 smoking could kill billion century
26  germany reject troop request southern afghanistan
42            six killed israeli airstrike hamas base
69  fukuda get letter putin indicating resolve isl...

                                              Headline
55       gop tax bill make lot sense featured shining
88  tamar braxtons husband denies getting laura go...
26    sniper saint showing iraqi shiite militia power
42                          drove car favorite engine
69           atiku david mark reveals happen nigerian


---