<a href="https://colab.research.google.com/github/paramate-p/selected-topic-cmu/blob/main/lecture04/workshop4/workshop4.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install kaggle



In [2]:
# Create the Kaggle config directory; -p keeps this cell re-runnable when the
# directory already exists.
! mkdir -p ~/.kaggle

In [3]:
# Put the API token (expected in the working directory) where the Kaggle CLI looks for it.
! cp kaggle.json ~/.kaggle/kaggle.json

In [4]:
# Owner may read/write the token; group and others get no access.
! chmod u=rw,go= ~/.kaggle/kaggle.json

## Download Data

In [5]:
# Download the SMS Spam Collection archive into the current working directory.
! kaggle datasets download uciml/sms-spam-collection-dataset

Dataset URL: https://www.kaggle.com/datasets/uciml/sms-spam-collection-dataset
License(s): unknown
Downloading sms-spam-collection-dataset.zip to /content
  0% 0.00/211k [00:00<?, ?B/s]
100% 211k/211k [00:00<00:00, 100MB/s]


In [6]:
import pandas as pd
import zipfile

# Extract every member of the downloaded archive next to the notebook
zip_path = './sms-spam-collection-dataset.zip'
extract_path = '.'

with zipfile.ZipFile(zip_path) as archive:
    archive.extractall(path=extract_path)

## Import Data

In [7]:
# Detect the Encoding
import chardet

# Read the raw bytes first, then let chardet guess the encoding from them.
with open("./spam.csv", mode='rb') as raw_file:
    raw_bytes = raw_file.read()

result = chardet.detect(raw_bytes)
print(result)

{'encoding': 'Windows-1252', 'confidence': 0.7269493857068697, 'language': ''}


In [8]:
# Decode the CSV with the encoding chardet reported for this file.
df = pd.read_csv(
    "./spam.csv",
    encoding='Windows-1252',
)
df.head()

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,


In [9]:
# Give the two columns we use readable names and keep only those two.
column_names = {"v1": "label", "v2": "text"}
df = df.rename(columns=column_names)[["label", "text"]]
df.head()

Unnamed: 0,label,text
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


## Text Preprocessing

In [10]:
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.stem import PorterStemmer, WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer

# Fetch the tokenizer models, stop-word list and WordNet data used below.
for resource in ('punkt', 'punkt_tab', 'stopwords', 'wordnet'):
    nltk.download(resource)
# Left as the final expression so the cell still displays the download status.
nltk.download('omw-1.4')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...


True

In [11]:
ps = PorterStemmer()
wordnet = WordNetLemmatizer()

# Build the English stop-word set once. Calling stopwords.words() per message
# (or, worse, per word) rebuilds the list every time, and a list lookup is a
# linear scan; a frozenset gives constant-time membership tests.
ENGLISH_STOPWORDS = frozenset(stopwords.words('english'))

def stop_words(text):
  """Remove English stop words from ``text``.

  The text is split with ``word_tokenize``; every token whose lowercase form
  is in the NLTK English stop-word list is dropped and the remaining tokens
  are joined with single spaces.

  Args:
    text: The raw message string.

  Returns:
    A string of the surviving tokens (original casing preserved).
  """
  tokens = word_tokenize(text)
  # Compare in lowercase: the stop-word list is lowercase, so capitalised
  # forms such as "The" or "I" would otherwise be kept.
  kept = [token for token in tokens if token.lower() not in ENGLISH_STOPWORDS]
  return ' '.join(kept)

def clean_text(text):
  """Reduce ``text`` to lowercase, stop-word-free, Porter-stemmed tokens.

  Steps: replace every non-letter character with a space, lowercase, split on
  whitespace, drop English stop words, stem the rest with ``ps``, and join
  the stems with single spaces.

  Args:
    text: The message string to normalise.

  Returns:
    A space-separated string of stems.
  """
  # NOTE(review): punctuation becomes whitespace, so fragments of contractions
  # (e.g. the "n't" token left by stop_words) can survive as stray one-letter
  # tokens such as "n".
  letters_only = re.sub(r'[^a-zA-Z]', ' ', text)
  tokens = letters_only.lower().split()
  stems = [ps.stem(token) for token in tokens if token not in ENGLISH_STOPWORDS]
  return ' '.join(stems)

In [12]:
# Strip stop words from the raw text first, then normalise and stem the result.
df['clean_text'] = df['text'].apply(stop_words).apply(clean_text)
df.head()

Unnamed: 0,label,text,clean_text
0,ham,"Go until jurong point, crazy.. Available only ...",go jurong point crazi avail bugi n great world...
1,ham,Ok lar... Joking wif u oni...,ok lar joke wif u oni
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,free entri wkli comp win fa cup final tkt st m...
3,ham,U dun say so early hor... U c already then say...,u dun say earli hor u c alreadi say
4,ham,"Nah I don't think he goes to usf, he lives aro...",nah n think goe usf live around though


### Bag of Words (CountVectorizer)

In [13]:
from sklearn.feature_extraction.text import CountVectorizer

# Learn the vocabulary from the cleaned messages, then materialise the
# term-count matrix as a dense array.
cv = CountVectorizer()
term_counts = cv.fit_transform(df['clean_text'])
X = term_counts.toarray()

In [16]:
# (number of messages, number of vocabulary terms)
X.shape

(5572, 6216)

### Feature Selection using Variance threshold

In [27]:
from sklearn.feature_selection import VarianceThreshold

# Keep only the term columns whose count variance exceeds the threshold.
# NOTE(review): the selector is fitted on the full matrix, i.e. before the
# train/test split in a later cell, so test rows influence which columns
# survive — consider fitting on the training split only.
var_thres = VarianceThreshold(threshold=0.1)
X_selected = var_thres.fit_transform(X)

# Map the surviving column indices back to vocabulary terms. The feature-name
# array is fetched once instead of being rebuilt for every selected index.
selected_features = var_thres.get_support(indices=True)
feature_names = cv.get_feature_names_out()
selected_terms = [feature_names[i] for i in selected_features]

print("Number of removed features:", X.shape[1] - len(selected_terms))

Number of removed features: 6212


In [31]:
# Apply a stratified hold-out split with a 70:30 ratio, shuffled with a fixed random seed
from sklearn.model_selection import train_test_split

state = 1234
labels = df['label']
# Stratifying relies on shuffling: train_test_split rejects stratify combined
# with shuffle=False, so the default shuffle is left on.
X_train, X_test, y_train, y_test = train_test_split(
    X_selected,
    labels,
    test_size=0.3,
    random_state=state,
    stratify=labels,
)
print(
    "The shape of matrix for train and test set: ",
    X_train.shape,
    y_train.shape,
    X_test.shape,
    y_test.shape,
)

The shape of matrix for train and test set:  (3900, 4) (3900,) (1672, 4) (1672,)


In [33]:
# Show the first ten feature rows of each split.
for heading, rows in (
    ("Top 10 rows of train set", X_train[:10]),
    ("Top 10 rows of test set", X_test[:10]),
):
    print(heading)
    print(rows)

Top 10 rows of train set
[[0 0 0 1]
 [0 0 0 2]
 [0 0 0 0]
 [0 0 0 0]
 [0 0 0 0]
 [0 0 0 0]
 [0 0 0 0]
 [0 0 0 0]
 [1 0 0 0]
 [0 0 0 1]]
Top 10 rows of test set
[[0 0 0 0]
 [0 0 0 1]
 [0 0 0 0]
 [0 0 0 0]
 [0 0 0 0]
 [0 0 0 0]
 [0 0 0 0]
 [0 0 0 0]
 [0 0 0 0]
 [0 0 0 0]]


In [34]:
# Show the last ten feature rows of each split.
print("Bottom 10 rows of train set")
print(X_train[-10:])
print("Bottom 10 rows of test set")
print(X_test[-10:])

Buttom 10 rows of train set
[[1 0 0 0]
 [0 0 0 0]
 [0 0 0 0]
 [0 0 0 0]
 [0 0 0 0]
 [0 0 0 0]
 [0 0 0 0]
 [0 0 0 0]
 [0 0 0 0]
 [0 0 0 0]]
Buttom 10 rows of test set
[[0 0 0 0]
 [0 0 0 0]
 [0 0 0 0]
 [0 0 0 0]
 [0 0 0 0]
 [0 0 0 0]
 [0 0 0 0]
 [0 0 0 0]
 [0 0 0 0]
 [0 0 0 1]]
