# Preprocessing

In [28]:
pip install contractions

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [76]:
import numpy as np
import pandas as pd
pd.set_option('display.max_colwidth', None)
import re
import nltk
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('averaged_perceptron_tagger')
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk import pos_tag
import contractions

from sklearn.model_selection import train_test_split

from sklearn.feature_selection import VarianceThreshold
from sklearn.feature_selection import mutual_info_classif
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.feature_selection import SequentialFeatureSelector
from sklearn.feature_selection import RFE
from sklearn.feature_selection import SelectFromModel

from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier

from sklearn.metrics import balanced_accuracy_score

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


In [30]:
df_train = pd.read_csv("datasets/sms_train.csv")
df_train.head()

Unnamed: 0,label,message
0,0,I dont. Can you send it to me. Plus how's mode.
1,0,Or i go home first lar ü wait 4 me lor.. I put down my stuff first..
2,0,"Me, i dont know again oh"
3,0,"I'll see, but prolly yeah"
4,0,"Night has ended for another day, morning has come in a special way. May you smile like the sunny rays and leaves your worries at the blue blue bay. Gud mrng"


In [31]:
!pip install scikit-learn --upgrade

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [32]:
from sklearn.feature_extraction.text import CountVectorizer

vectorizer = CountVectorizer()
vectorizer.fit(df_train['message'])
vocabulary = vectorizer.vocabulary_
features = list(vocabulary.keys())
num_features = len(features)
print("Number of features (unique words):", num_features)

Number of features (unique words): 7913


In [33]:
df_test = pd.read_csv("datasets/sms_test.csv")
df_test.head()

Unnamed: 0,message
0,"Yo, you at jp and hungry like a mofo?"
1,It's é only $140 ard...É rest all ard $180 at least...Which is é price 4 é 2 bedrm ($900)
2,"&lt;#&gt; , that's all? Guess that's easy enough"
3,Y?WHERE U AT DOGBREATH? ITS JUST SOUNDING LIKE JAN C THATS AL!!!!!!!!!
4,"Good afternoon sexy buns! How goes the job search ? I wake and you are my first thought as always, my love. I wish your fine and happy and know I adore you!"


## Case Folding

In [34]:
df_train['message'] = df_train['message'].str.lower()
df_train.head()

Unnamed: 0,label,message
0,0,i dont. can you send it to me. plus how's mode.
1,0,or i go home first lar ü wait 4 me lor.. i put down my stuff first..
2,0,"me, i dont know again oh"
3,0,"i'll see, but prolly yeah"
4,0,"night has ended for another day, morning has come in a special way. may you smile like the sunny rays and leaves your worries at the blue blue bay. gud mrng"


In [35]:
vectorizer = CountVectorizer()
vectorizer.fit(df_train['message'])
vocabulary = vectorizer.vocabulary_
features = list(vocabulary.keys())
num_features = len(features)
print("Number of features (unique words):", num_features)

Number of features (unique words): 7913


In [36]:
df_test['message'] = df_test['message'].str.lower()
df_test.head()

Unnamed: 0,message
0,"yo, you at jp and hungry like a mofo?"
1,it's é only $140 ard...é rest all ard $180 at least...which is é price 4 é 2 bedrm ($900)
2,"&lt;#&gt; , that's all? guess that's easy enough"
3,y?where u at dogbreath? its just sounding like jan c thats al!!!!!!!!!
4,"good afternoon sexy buns! how goes the job search ? i wake and you are my first thought as always, my love. i wish your fine and happy and know i adore you!"


## Cleansing/Punctuation Removal

In [37]:
def cleansing(messages):
  clean_message = re.sub('(@[A-Za-z0-9]+)|(#[A-Za-z0-9]+)|([^A-Za-z \t])|(\w+:\/\/\S+)','', messages)
  return clean_message

In [38]:
df_train['message'] = df_train['message'].apply(cleansing)
df_train.head()

Unnamed: 0,label,message
0,0,i dont can you send it to me plus hows mode
1,0,or i go home first lar wait me lor i put down my stuff first
2,0,me i dont know again oh
3,0,ill see but prolly yeah
4,0,night has ended for another day morning has come in a special way may you smile like the sunny rays and leaves your worries at the blue blue bay gud mrng


In [39]:
df_test['message'] = df_test['message'].apply(cleansing)
df_test.head()

Unnamed: 0,message
0,yo you at jp and hungry like a mofo
1,its only ard rest all ard at leastwhich is price bedrm
2,ltgt thats all guess thats easy enough
3,ywhere u at dogbreath its just sounding like jan c thats al
4,good afternoon sexy buns how goes the job search i wake and you are my first thought as always my love i wish your fine and happy and know i adore you


## Normalization

In [40]:
#Normalization
def normalize(messages):
  return contractions.fix(messages)

In [41]:
df_train['message'] = df_train['message'].apply(normalize)
df_train.head()

Unnamed: 0,label,message
0,0,i do not can you send it to me plus how is mode
1,0,or i go home first lar wait me lor i put down my stuff first
2,0,me i do not know again oh
3,0,ill see but probably yeah
4,0,night has ended for another day morning has come in a special way may you smile like the sunny rays and leaves your worries at the blue blue bay gud mrng


In [42]:
df_test['message'] = df_test['message'].apply(normalize)
df_test.head()

Unnamed: 0,message
0,yo you at jp and hungry like a mofo
1,its only ard rest all ard at leastwhich is price bedrm
2,ltgt that is all guess that is easy enough
3,ywhere you at dogbreath its just sounding like jan c that is al
4,good afternoon sexy buns how goes the job search i wake and you are my first thought as always my love i wish your fine and happy and know i adore you


## Tokenization

In [43]:
def tokenize(messages):
    return word_tokenize(messages)

In [44]:
df_train['message'] = df_train['message'].apply(tokenize)
df_train.head()

Unnamed: 0,label,message
0,0,"[i, do, not, can, you, send, it, to, me, plus, how, is, mode]"
1,0,"[or, i, go, home, first, lar, wait, me, lor, i, put, down, my, stuff, first]"
2,0,"[me, i, do, not, know, again, oh]"
3,0,"[ill, see, but, probably, yeah]"
4,0,"[night, has, ended, for, another, day, morning, has, come, in, a, special, way, may, you, smile, like, the, sunny, rays, and, leaves, your, worries, at, the, blue, blue, bay, gud, mrng]"


In [45]:
df_test['message'] = df_test['message'].apply(tokenize)
df_test.head()

Unnamed: 0,message
0,"[yo, you, at, jp, and, hungry, like, a, mofo]"
1,"[its, only, ard, rest, all, ard, at, leastwhich, is, price, bedrm]"
2,"[ltgt, that, is, all, guess, that, is, easy, enough]"
3,"[ywhere, you, at, dogbreath, its, just, sounding, like, jan, c, that, is, al]"
4,"[good, afternoon, sexy, buns, how, goes, the, job, search, i, wake, and, you, are, my, first, thought, as, always, my, love, i, wish, your, fine, and, happy, and, know, i, adore, you]"


## Stopword Removal

In [46]:
def stop_removal(messages):
  words = stopwords.words('english')
  words = set(words)
  return [word for word in messages if word not in words]

In [47]:
df_train['message']  = df_train['message'] .apply(stop_removal)
df_train.head()

Unnamed: 0,label,message
0,0,"[send, plus, mode]"
1,0,"[go, home, first, lar, wait, lor, put, stuff, first]"
2,0,"[know, oh]"
3,0,"[ill, see, probably, yeah]"
4,0,"[night, ended, another, day, morning, come, special, way, may, smile, like, sunny, rays, leaves, worries, blue, blue, bay, gud, mrng]"


In [48]:
df_test['message']  = df_test['message'] .apply(stop_removal)
df_test.head()

Unnamed: 0,message
0,"[yo, jp, hungry, like, mofo]"
1,"[ard, rest, ard, leastwhich, price, bedrm]"
2,"[ltgt, guess, easy, enough]"
3,"[ywhere, dogbreath, sounding, like, jan, c, al]"
4,"[good, afternoon, sexy, buns, goes, job, search, wake, first, thought, always, love, wish, fine, happy, know, adore]"


## Lemmatization

In [49]:
def lemmatize_pos(words, pos):
  lemmatizer = WordNetLemmatizer()
  
  pos_mapping = {
      'N': 'n',
        'V': 'v',
        'R': 'r',
        'J': 'a'
  }

  wordnet_pos = pos_mapping.get(pos[0], 'n')
  if wordnet_pos == 'n':
    return lemmatizer.lemmatize(words, pos='v')
  return lemmatizer.lemmatize(words, pos=wordnet_pos)

In [50]:
df_train['message']  = df_train['message'] .apply(lambda x: [lemmatize_pos(words, pos) for words, pos in pos_tag(x)])
df_train.head()

Unnamed: 0,label,message
0,0,"[send, plus, mode]"
1,0,"[go, home, first, lar, wait, lor, put, stuff, first]"
2,0,"[know, oh]"
3,0,"[ill, see, probably, yeah]"
4,0,"[night, end, another, day, morning, come, special, way, may, smile, like, sunny, ray, leave, worry, blue, blue, bay, gud, mrng]"


In [51]:
all_tokens = [token for sentence in df_train['message'] for token in sentence]
num_features = len(set(all_tokens))
print("Number of features (unique words):", num_features)

Number of features (unique words): 6784


In [52]:
df_test['message']  = df_test['message'] .apply(lambda x: [lemmatize_pos(words, pos) for words, pos in pos_tag(x)])
df_test.head()

Unnamed: 0,message
0,"[yo, jp, hungry, like, mofo]"
1,"[ard, rest, ard, leastwhich, price, bedrm]"
2,"[ltgt, guess, easy, enough]"
3,"[ywhere, dogbreath, sound, like, jan, c, al]"
4,"[good, afternoon, sexy, buns, go, job, search, wake, first, think, always, love, wish, fine, happy, know, adore]"


# TF-IDF

In [56]:
from sklearn.feature_extraction.text import TfidfVectorizer 

In [53]:
def join_text_list(texts):
    return ' '.join([text for text in texts])

In [61]:
df_train["final message"] = df_train["message"].apply(join_text_list)
df_train.head()

Unnamed: 0,label,message,final message
0,0,"[send, plus, mode]",send plus mode
1,0,"[go, home, first, lar, wait, lor, put, stuff, first]",go home first lar wait lor put stuff first
2,0,"[know, oh]",know oh
3,0,"[ill, see, probably, yeah]",ill see probably yeah
4,0,"[night, end, another, day, morning, come, special, way, may, smile, like, sunny, ray, leave, worry, blue, blue, bay, gud, mrng]",night end another day morning come special way may smile like sunny ray leave worry blue blue bay gud mrng


In [60]:
df_test["final message"] = df_test["message"].apply(join_text_list)
df_test.head()

Unnamed: 0,message,final message
0,"[yo, jp, hungry, like, mofo]",yo jp hungry like mofo
1,"[ard, rest, ard, leastwhich, price, bedrm]",ard rest ard leastwhich price bedrm
2,"[ltgt, guess, easy, enough]",ltgt guess easy enough
3,"[ywhere, dogbreath, sound, like, jan, c, al]",ywhere dogbreath sound like jan c al
4,"[good, afternoon, sexy, buns, go, job, search, wake, first, think, always, love, wish, fine, happy, know, adore]",good afternoon sexy buns go job search wake first think always love wish fine happy know adore


In [68]:
tf_idf_train = TfidfVectorizer(binary=True)
tfidf_mat_train = tf_idf_train.fit_transform(df_train["final message"]).toarray()
tfidf_train = pd.DataFrame(tfidf_mat_train, columns=tf_idf_train.get_feature_names_out())
tfidf_train.head()

Unnamed: 0,aa,aah,aaniye,aaooooright,aathilove,aathiwhere,ab,abbey,abdomen,abeg,...,zed,zf,zhong,zindgi,zoe,zogtorius,zoom,zouk,zs,zyada
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [69]:
tf_idf_test = TfidfVectorizer(binary=True)
tfidf_mat_test = tf_idf_test.fit_transform(df_test["final message"]).toarray()
tfidf_test = pd.DataFrame(tfidf_mat_test, columns=tf_idf_test.get_feature_names_out())
tfidf_test.head()

Unnamed: 0,aathilove,aathiwhere,abi,abiola,able,abnormally,aboutas,abta,aburo,acc,...,yr,yrs,yummy,yun,yuo,yup,yupz,ywhere,zed,zero
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.445673,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [109]:
#X = tfidf_train
#X[0:5]

In [110]:
#y = df_train['label']
#y

# Train Validation Split

In [111]:
#X_train, X_valid, y_train, y_valid = train_test_split(
#    X, 
#    y,
#    test_size=0.2,
#    shuffle=True,
#)

In [112]:
#print("Train set: ", X_train.shape, y_train.shape)
#print("Validation set: ", X_valid.shape, y_valid.shape)