# Text Mining - Group Project

---

# Preparing the Data

In [107]:
import re
import nltk
import spacy
import string
import random
import chardet
import numpy as np
import collections
import pandas as pd
import seaborn as sns
import en_core_web_sm 
from sklearn import metrics
import matplotlib.pyplot as plt
from scipy.stats import wilcoxon
from sklearn.utils import shuffle
from nltk.tokenize import word_tokenize, regexp_tokenize
from sklearn.svm import SVR, SVC, LinearSVC
from matplotlib.ticker import PercentFormatter
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import GridSearchCV
from torch.utils.data.dataset import random_split
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.metrics import recall_score, precision_score, f1_score, accuracy_score
from sklearn.feature_extraction.text import TfidfVectorizer, HashingVectorizer, CountVectorizer

## Loading the Data

### Reddit News

In [3]:
# Load the Reddit world-news CSV; bytes that cannot be decoded as UTF-8 are
# ignored instead of raising.
reddit_path = "data/reddit_worldnews.csv"
reddit_file = pd.read_csv(reddit_path, encoding="utf-8", encoding_errors="ignore")
print("reddit_file_clmns:", reddit_file.columns)

# Keep only the title text, under the column name shared by both corpora.
reddit = reddit_file[["title"]].rename(columns={"title": "Headline"})
reddit.head()

reddit_file_clmns: Index(['time_created', 'date_created', 'up_votes', 'down_votes', 'title',
       'over_18', 'author', 'subreddit'],
      dtype='object')


Unnamed: 0,Headline
0,Scores killed in Pakistan clashes
1,Japan resumes refuelling mission
2,US presses Egypt on Gaza border
3,Jump-start economy: Give health care to all
4,Council of Europe bashes EU&UN terror blacklist


### Ireland News

In [5]:
# Load the Irish news-headlines CSV; bytes that cannot be decoded as UTF-8 are
# ignored instead of raising.
ireland_path = "data/ireland-news-headlines.csv"
ireland_file = pd.read_csv(ireland_path, encoding="utf-8", encoding_errors="ignore")
print("ireland_file_clmns:", ireland_file.columns)

# Keep only the headline text, under the column name shared by both corpora.
ireland = ireland_file[["headline_text"]].rename(columns={"headline_text": "Headline"})
ireland.head()

ireland_file_clmns: Index(['publish_date', 'headline_category', 'headline_text'], dtype='object')


Unnamed: 0,Headline
0,UUP sees possibility of voting Major out
1,Pubs targeted as curbs on smoking are extended
2,Papers reveal secret links with O'Neill cabinet
3,Domestic chaos as Italy takes EU presidency
4,Learning about the star to which we owe life


In [7]:
# Sanity check: a single selected column is a Series, not a DataFrame.
type(ireland.loc[:, "Headline"])

pandas.core.series.Series

## Preprocessing

In [104]:
# Pre-processing
# Regular expression used for tokenization. It is written in verbose mode
# ((?x)), so whitespace and "#" comments inside the pattern are ignored by the
# regex engine. Alternatives are tried left to right.
# NOTE: preprocessing() lower-cases each headline before tokenizing, so the
# upper-case abbreviation branch cannot match there.
pattern = r'''(?x)        # verbose mode
(?:[A-Z]\.)+              # abbreviations made of capital letters and dots
|\w+(?:-\w+)*             # words, optionally with internal hyphens
|\$?\d+(?:\.\d+)?%?       # numbers, optionally with $ prefix or % suffix
|\.\.\.                   # ellipsis
|[][.,;"\'?():_`-]        # single punctuation marks; "-" is last so it is a literal, not a range
'''

# Single WordNet lemmatizer instance, created once and reused by preprocessing()
# for every token.
lemmatizer = nltk.stem.WordNetLemmatizer()

def preprocessing(df):
    """Tokenize, filter and lemmatize every headline in ``df["Headline"]``.

    Each headline is stripped, lower-cased and tokenized with the module-level
    ``pattern``. A token is kept only if it is purely alphabetic, longer than
    two characters and not an NLTK English stopword; the kept tokens are then
    lemmatized with the module-level ``lemmatizer`` (called without a
    part-of-speech argument, so its default applies).

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``"Headline"`` column. It is not modified.

    Returns
    -------
    pandas.DataFrame
        New frame with a fresh 0..n-1 index and a single ``"Headline"`` column,
        one row per input row and in input order. Each cell is the ``str()`` of
        the headline's list of lemmas, e.g. ``"['score', 'killed']"``.
        Missing headlines (NaN/None) yield ``"[]"``.
    """
    # A set makes the per-token stopword lookup O(1); the NLTK word list is a
    # plain Python list, which would be scanned linearly for every token.
    stopwords = set(nltk.corpus.stopwords.words('english'))

    headlines = df["Headline"]
    final_pp = []
    for headln, is_missing in zip(headlines.to_list(), headlines.isna().to_list()):
        if is_missing:
            # str(NaN) would be "nan", which survives every filter below and
            # would inject a bogus 'nan' token; treat missing text as empty.
            final_pp.append(str([]))
            continue

        line = str(headln).strip().lower()
        tokens = regexp_tokenize(line, pattern)
        # isalpha() already rejects punctuation and numbers, so no separate
        # punctuation check is needed.
        # NOTE: hyphenated tokens such as "jump-start" contain a non-letter,
        # fail isalpha() and are therefore dropped as a whole.
        lemmas = [
            lemmatizer.lemmatize(tok)
            for tok in tokens
            if tok.isalpha() and len(tok) > 2 and tok not in stopwords
        ]
        # Stored as the string form of the list, as downstream cells expect.
        final_pp.append(str(lemmas))

    return pd.DataFrame(final_pp, columns=["Headline"])

In [75]:
# Run the shared preprocessing step on the Reddit headlines.
reddit_pp = reddit.pipe(preprocessing)
reddit_pp

Unnamed: 0,Headline
0,"['score', 'killed', 'pakistan', 'clash']"
1,"['japan', 'resume', 'refuelling', 'mission']"
2,"['press', 'egypt', 'gaza', 'border']"
3,"['economy', 'give', 'health', 'care']"
4,"['council', 'europe', 'bash', 'terror', 'black..."
...,...
509231,"['heil', 'trump', 'donald', 'trump', 'white', ..."
509232,"['people', 'speculating', 'could', 'madeleine'..."
509233,"['professor', 'receives', 'arab', 'researcher'..."
509234,"['nigel', 'farage', 'attack', 'response', 'tru..."


In [105]:
# Run the shared preprocessing step on the Irish headlines.
ireland_pp = ireland.pipe(preprocessing)
ireland_pp

Unnamed: 0,Headline
0,"['uup', 'see', 'possibility', 'voting', 'major']"
1,"['pub', 'targeted', 'curb', 'smoking', 'extend..."
2,"['paper', 'reveal', 'secret', 'link', 'neill',..."
3,"['domestic', 'chaos', 'italy', 'take', 'presid..."
4,"['learning', 'star', 'owe', 'life']"
...,...
1611490,"['reserve', 'member', 'defence', 'force', 'all..."
1611491,"['maureen', 'dowd', 'joe', 'biden', 'crazy', '..."
1611492,"['andy', 'murray', 'roll', 'back', 'year', 'ce..."
1611493,"['delta', 'variant', 'could', 'significant', '..."


In [None]:
# TODO: We need to use pre-trained embeddings, since the headlines alone are not enough.
# TODO (open question): Should we randomly drop part of the Ireland dataset so that both datasets
# have an equal number of headlines, or do we want to normalize the number of headlines per theme
# by the amount of data at the end?

---

## Splitting data into a train and a test set 
Each dataset is shuffled and then split: 80% for training and 20% for testing.

In [108]:
# Shuffle both preprocessed corpora with the same fixed seed for reproducibility.
reddit_df, ireland_df = (
    shuffle(frame, random_state=42) for frame in (reddit_pp, ireland_pp)
)

In [109]:
# Train Test Split
train_reddit, test_reddit = train_test_split(reddit_df, test_size=0.20, random_state=42)
train_ireland, test_ireland = train_test_split(ireland_df, test_size=0.20, random_state=42)

---

# Sentence Embeddings