# <center> **Text Mining Project: Final Solution.**

___

Group Number: 12 
- Omar Jarir [m20201378@novaims.unl.pt]  
- Chung-Ting Huang [m20210437@novaims.unl.pt] 

___

In our experiments, an MLP classifier trained on TF-IDF features gave the best results.

# **1. Data import**

___

In [1]:
import requests as rq
from io import BytesIO

from bs4 import BeautifulSoup
import nltk
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer, PorterStemmer
from nltk.stem.wordnet import WordNetLemmatizer

import numpy as np
import pandas as pd
import re
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.feature_selection import f_classif, SelectKBest
from sklearn.neural_network import MLPClassifier
from tqdm import tqdm

In [2]:
# Fetch the NLTK resources used below: 'wordnet' for WordNetLemmatizer and
# 'stopwords' for the English stop-word list. Packages that are already
# installed are reported as up-to-date (see the cell output).
nltk.download('wordnet')
nltk.download('stopwords')

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\user\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\user\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [3]:
# Fixed seed for reproducibility; passed to the MLPClassifier in the training cell.
random_state = 2022

In [4]:
# Raw CSV files hosted in the project's GitHub repository.
url_train = "https://raw.githubusercontent.com/omarja12/Text_Mining/main/train.csv"
url_test = "https://raw.githubusercontent.com/omarja12/Text_Mining/main/test.csv"


def _download(url, timeout=30):
    """Return the raw bytes served at ``url``.

    Raises ``requests.HTTPError`` on a 4xx/5xx response, so that an error page
    is never silently parsed as CSV, and gives up after ``timeout`` seconds
    instead of hanging the notebook on a stalled connection.
    """
    response = rq.get(url, timeout=timeout)
    response.raise_for_status()
    return response.content


data_train = _download(url_train)
data_test = _download(url_test)
ds_train = pd.read_csv(BytesIO(data_train))
ds_test = pd.read_csv(BytesIO(data_test))

In [5]:
# Drop the row identifier. Re-assigning (instead of dropping in place) together
# with errors='ignore' keeps this cell re-runnable: a second execution no
# longer raises KeyError because 'Id' is already gone.
ds_train = ds_train.drop(columns=['Id'], errors='ignore')
ds_test = ds_test.drop(columns=['Id'], errors='ignore')

# Once 'Id' is dropped, every column of the test set is a headline column (the
# cleaning cell relies on the same convention). Missing headlines are replaced
# by a blank in BOTH sets so the text cleaning never receives a NaN float.
headline_cols = list(ds_test.columns)
ds_train[headline_cols] = ds_train[headline_cols].fillna(" ")
ds_test[headline_cols] = ds_test[headline_cols].fillna(" ")

# Binary target used for feature selection and model training.
y_train = np.array(ds_train['Closing Status'])

In [6]:
# Extra high-frequency news words removed on top of NLTK's English stop words.
# NOTE(review): clean() lower-cases the text before filtering, so the
# capitalised entries ('U.S.', 'Say', 'New') look like they can never match
# -- confirm before relying on them.
extra_stop_words = ('U.S.', 'say', 'Say', 'says', 'year', 'new', 'New')
stop = set(stopwords.words('english')) | set(extra_stop_words)

# Lemmatizer and stemmer used by clean().
# SnowballStemmer is an alternative to PorterStemmer.
lemma = WordNetLemmatizer()
stemmer = PorterStemmer()

# **2. Text cleaning:**

___ 

In [7]:
def clean(text_list, lemmatize = True, stemming = False):
    '''
    Clean a corpus of raw texts and return the cleaned strings.

    Each text goes through the following steps, in this order:
    1. Lower-casing.
    2. Removal of HTML tags, @mentions and #hashtags.
    3. Expansion of English contractions (n't, 've, 're, 'd, 'll) and removal
       of the possessive 's.
    4. Replacement of every non-letter character (digits, punctuation) by a
       space.
    5. Removal of the words found in the module-level ``stop`` set.
    6. Noun lemmatization with ``lemma`` when ``lemmatize`` is true.
    7. Stemming with ``stemmer`` when ``stemming`` is true.
    8. Removal of single-character tokens and of redundant white space.

    Steps 2 and 3 have to run before step 4 because they rely on the
    characters '<', '@', '#' and the apostrophe, all of which step 4 erases.

    Parameters
    ----------
    text_list : iterable of str
        Raw texts. Missing values (None / NaN) are treated as empty text.
    lemmatize : bool, default True
        Apply the lemmatizer to every remaining token.
    stemming : bool, default False
        Apply the stemmer to every remaining token. The stemmer runs only
        when this flag is true.

    Returns
    -------
    list of str
        One cleaned string per input text, in input order. A text with no
        token left is returned as a single blank (" ").
    '''
    # (pattern, replacement) pairs applied in order. The irregular forms come
    # first so that "can't" / "won't" do not leave the fragments "ca" / "wo".
    # The look-behind restricts the generic rules to real contractions, so an
    # opening quote as in 'stocks is not mistaken for a possessive.
    contractions = [
        (re.compile(pattern), replacement)
        for pattern, replacement in (
            (r"\bcan't\b", "can not"),
            (r"\bwon't\b", "will not"),
            (r"\bshan't\b", "shall not"),
            (r"(?<=[a-z])n't\b", " not"),
            (r"(?<=[a-z])'s\b", " "),
            (r"(?<=[a-z])'ve\b", " have"),
            (r"(?<=[a-z])'re\b", " are"),
            (r"(?<=[a-z])'d\b", " would"),
            (r"(?<=[a-z])'ll\b", " will"),
        )
    ]
    updates = []

    for raw in tqdm(text_list):

        # LOWERCASE TEXT (missing values become empty text instead of raising)
        if isinstance(raw, str):
            text = raw.lower()
        elif raw is None or pd.isna(raw):
            text = ""
        else:
            text = str(raw).lower()

        # REMOVE TAGS (explicit parser: no "guessed parser" warning and the
        # result does not depend on which optional parsers are installed)
        text = BeautifulSoup(text, "html.parser").get_text(separator=' ')

        # Typographic apostrophes must match the contraction patterns too.
        text = text.replace("\u2019", "'")

        # REMOVE MENTIONS AND HASHTAGS
        text = re.sub(r"[@#][A-Za-z0-9_]+", " ", text)

        # EXPAND CONTRACTIONS
        for pattern, replacement in contractions:
            text = pattern.sub(replacement, text)

        # REMOVE NUMERICAL DATA and PUNCTUATION
        text = re.sub("[^a-zA-Z]", " ", text)

        # REMOVE STOP WORDS
        tokens = [word for word in text.split() if word not in stop]

        if lemmatize:
            tokens = [lemma.lemmatize(word, pos='n') for word in tokens]

        if stemming:
            tokens = [stemmer.stem(word) for word in tokens]

        # REMOVE SINGLE CHARACTER WORDS (tokens contain letters only here)
        tokens = [word for word in tokens if len(word) > 1]

        # Joining the tokens also collapses white space; keep a blank
        # placeholder for texts that ended up empty.
        updates.append(" ".join(tokens) or " ")

    return updates

**Corpus cleaning:**

In [8]:
# Headline columns: every column of the test set.
Headlines = list(ds_test.columns)

# Empty frames with the original column layout; filled column by column below.
ds_train_clean = pd.DataFrame(columns=ds_train.columns)
ds_test_clean = pd.DataFrame(columns=ds_test.columns)

# Clean each headline column of the train set, then of the test set.
for headline in Headlines:
    ds_train_clean[headline] = clean(ds_train[headline], lemmatize=True, stemming=False)
    ds_test_clean[headline] = clean(ds_test[headline], lemmatize=True, stemming=False)

100%|██████████| 1690/1690 [00:04<00:00, 375.79it/s]
100%|██████████| 299/299 [00:00<00:00, 664.83it/s]
100%|██████████| 1690/1690 [00:02<00:00, 690.19it/s]
100%|██████████| 299/299 [00:00<00:00, 660.42it/s]
100%|██████████| 1690/1690 [00:02<00:00, 567.82it/s]
100%|██████████| 299/299 [00:00<00:00, 474.12it/s]
100%|██████████| 1690/1690 [00:04<00:00, 418.14it/s]
100%|██████████| 299/299 [00:00<00:00, 580.92it/s]
100%|██████████| 1690/1690 [00:02<00:00, 616.02it/s]
100%|██████████| 299/299 [00:00<00:00, 705.59it/s]
100%|██████████| 1690/1690 [00:02<00:00, 764.46it/s]
100%|██████████| 299/299 [00:00<00:00, 736.87it/s]
100%|██████████| 1690/1690 [00:02<00:00, 797.63it/s]
100%|██████████| 299/299 [00:00<00:00, 773.05it/s]
100%|██████████| 1690/1690 [00:02<00:00, 746.23it/s]
100%|██████████| 299/299 [00:00<00:00, 714.01it/s]
100%|██████████| 1690/1690 [00:02<00:00, 740.99it/s]
100%|██████████| 299/299 [00:00<00:00, 702.28it/s]
100%|██████████| 1690/1690 [00:02<00:00, 721.09it/s]
100%|██████

___

**Removing Common Words That Are Both Positive and Negative**

In [9]:
def TopnGrams(corpus, top_k, n):
    '''
    Return the ``top_k`` most frequent n-grams of ``corpus``.

    The corpus is vectorized with a CountVectorizer restricted to n-grams of
    exactly ``n`` words and to a vocabulary of at most 2000 entries. The
    occurrences of each n-gram are then summed over all documents.

    Returns a dataframe with the columns "Ngram" and "Freq", sorted by
    decreasing frequency.
    '''
    vectorizer = CountVectorizer(ngram_range=(n, n), max_features=2000)
    vectorizer.fit(corpus)

    # 1 x vocabulary matrix holding the total count of every n-gram.
    totals = vectorizer.transform(corpus).sum(axis=0)

    ranked = sorted(
        [(ngram, totals[0, column]) for ngram, column in vectorizer.vocabulary_.items()],
        key=lambda pair: pair[1],
        reverse=True,
    )

    top_df = pd.DataFrame(ranked[:top_k])
    top_df.columns = ["Ngram", "Freq"]
    return top_df
    
def top_common_words(train_pos, train_neg):
    '''
    Return the unigrams that rank among the 100 most frequent ones in both
    the positive and the negative corpus.

    The result has one row per shared unigram with the columns "Ngram",
    "Freq_x" (frequency in ``train_pos``) and "Freq_y" (frequency in
    ``train_neg``).
    '''
    top_positive = TopnGrams(train_pos, top_k=100, n=1)
    top_negative = TopnGrams(train_neg, top_k=100, n=1)

    # The default inner join keeps only the n-grams present in both rankings.
    return top_positive.merge(top_negative, on='Ngram')

def remove_common_words(text_list, top_words):
    '''
    Remove from every text the words listed in ``top_words``.

    Parameters
    ----------
    text_list : iterable of str
        White-space separated texts.
    top_words : pandas.DataFrame
        Frame whose "Ngram" column holds the words to drop (as returned by
        top_common_words).

    Returns
    -------
    list of str
        One filtered string per input text, in input order. A text made only
        of common words becomes the empty string.
    '''
    # Build the lookup once: rebuilding the column as a list for every word
    # and scanning it linearly made the filter needlessly slow.
    common = set(top_words['Ngram'])

    updates = []
    for text in tqdm(text_list):
        updates.append(" ".join(word for word in text.split() if word not in common))

    return updates

In [10]:
# One document per row: all cleaned headlines of the row joined by blanks.
ds_train_clean['HeadlinesTotal'] = \
                 ds_train_clean[Headlines].apply(lambda row: " ".join(row.values.astype(str)), axis=1)
ds_test_clean['HeadlinesTotal'] = \
                 ds_test_clean[Headlines].apply(lambda row: " ".join(row.values.astype(str)), axis=1)

# Split the training documents by label to find the words that are frequent
# in both classes.
train_pos = ds_train_clean.loc[ds_train['Closing Status'] == 1, 'HeadlinesTotal']
train_neg = ds_train_clean.loc[ds_train['Closing Status'] == 0, 'HeadlinesTotal']

top_words = top_common_words(train_pos=train_pos, train_neg=train_neg)

# Apply the filter to BOTH corpora. Filtering only the test set would leave
# the common words in the training text, so the vectorizer would learn
# features that can never occur in the documents it later has to score.
ds_train_clean['HeadlinesTotal'] = remove_common_words(ds_train_clean['HeadlinesTotal'], top_words)
ds_test_clean['HeadlinesTotal'] = remove_common_words(ds_test_clean['HeadlinesTotal'], top_words)

100%|██████████| 299/299 [00:00<00:00, 484.09it/s]


# **3. Feature Engineering:**

***

**Creating TF-IDF Features:**

In [11]:
# Vectorization parameters:

# Range (inclusive) of n-gram sizes for tokenizing text.
NGRAM_RANGE = (1, 2)

# Number of features kept by SelectKBest after vectorization (tfidf() caps it
# at the vocabulary size).
TOP_K = 300

# Whether text should be split into word or character n-grams.
# One of 'word', 'char'.
TOKEN_MODE = 'word'

# Document-frequency bounds passed to the vectorizer as min_df / max_df.
# In scikit-learn an integer is an absolute document count and a float is a
# proportion of documents: tokens found in fewer than 10 documents or in more
# than 80% of the documents are discarded.
MIN_DOCUMENT_FREQUENCY = 10
MAX_DOCUMENT_FREQUENCY = 0.8
# Upper bound on the vocabulary size built by the vectorizer (max_features).
MAX_FEATURES=30000

def tfidf(train_texts, train_labels, test_texts):
    '''
    Turn the train and test texts into dense TF-IDF feature matrices.

    The vocabulary and the feature selection are learnt on the training data
    only and then applied unchanged to the test data. The module-level
    constants NGRAM_RANGE, TOKEN_MODE, MIN_DOCUMENT_FREQUENCY,
    MAX_DOCUMENT_FREQUENCY, MAX_FEATURES and TOP_K configure the step.

    Returns a tuple ``(x_train, x_test)`` of float64 arrays with one row per
    text and at most TOP_K columns.
    '''
    vectorizer = TfidfVectorizer(
        ngram_range=NGRAM_RANGE,  # Use 1-grams + 2-grams.
        strip_accents='unicode',
        decode_error='replace',
        analyzer=TOKEN_MODE,  # Split text into word tokens.
        max_df=MAX_DOCUMENT_FREQUENCY,
        max_features=MAX_FEATURES,
        min_df=MIN_DOCUMENT_FREQUENCY,
    )

    # Vocabulary comes from the training texts; test texts are only projected.
    train_matrix = vectorizer.fit_transform(train_texts)
    test_matrix = vectorizer.transform(test_texts)

    # Keep the features with the highest ANOVA F-score against the labels;
    # k can never exceed the number of available features.
    n_selected = min(TOP_K, train_matrix.shape[1])
    selector = SelectKBest(f_classif, k=n_selected)
    train_selected = selector.fit_transform(train_matrix, train_labels)
    test_selected = selector.transform(test_matrix)

    return (
        train_selected.astype('float64').toarray(),
        test_selected.astype('float64').toarray(),
    )

In [12]:
# Labels and the per-row documents fed to the TF-IDF step.
y_train = np.array(ds_train['Closing Status'])
train_documents = ds_train_clean["HeadlinesTotal"]
test_documents = ds_test_clean["HeadlinesTotal"]

X_tfidf, X_tfidf_test = tfidf(train_documents, y_train, test_documents)

# **4. Training the model:**

***

The MLP classifier was our best-performing model, so it is the one trained here to produce the final predictions.

In [13]:
# Three hidden layers (80, 60, 40 units); early stopping holds out part of
# the training data and stops once the validation score stops improving.
mlp = MLPClassifier(
    hidden_layer_sizes=(80,60,40),
    activation='relu',
    solver='adam',
    early_stopping=True,
    random_state=random_state,
)
mlp.fit(X_tfidf, y_train)

# One prediction per test row, indexed like the test set, exported as CSV.
pred = pd.DataFrame({'prediction': mlp.predict(X_tfidf_test)}, index=ds_test.index)
pred.to_csv('Predictions_12.csv', sep=",")
pred

Unnamed: 0,prediction
0,1
1,1
2,1
3,1
4,1
...,...
294,1
295,1
296,0
297,1


# <center> **THE END.**