In [1]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_selection import VarianceThreshold
from sklearn.model_selection import train_test_split

# Load the SMS spam dataset, keeping only the label (v1) and message text (v2).
# Selecting the two needed columns with usecols replaces dropping the
# 'Unnamed: N' columns by name, so the load no longer raises a KeyError when
# one of those extra columns is missing from the CSV.
dataset_path = 'spam.csv'
df = pd.read_csv(dataset_path, encoding='ISO-8859-1', usecols=['v1', 'v2'])

# Turn every message into a sparse TF-IDF row vector.
# NOTE(review): the vectorizer and the variance filter below are fitted on the
# full corpus *before* the train/test split, so the test messages influence the
# vocabulary, the IDF weights and the selected features. If these splits are
# used to evaluate a model, fit both steps on the training split only (e.g. via
# a Pipeline) to avoid data leakage.
tfidf_vectorizer = TfidfVectorizer()
X_tfidf = tfidf_vectorizer.fit_transform(df['v2'])

# A threshold of 0.1 appears to have been tried first and rejected with
# "No feature in X meets the variance threshold 0.10000", hence the much
# smaller value used here.
chosen_threshold = 0.001

# Fit the selector first so the per-feature variances can be read from it.
variance_selector = VarianceThreshold(threshold=chosen_threshold)
variance_selector.fit(X_tfidf)

# `variances_` is computed directly on the sparse matrix, which avoids
# materialising the whole TF-IDF matrix as a dense array (several hundred MB
# for this corpus) just to call .var(axis=0). For the non-zero threshold used
# here the values are the per-column population variances.
feature_variances = variance_selector.variances_

print("Feature Variances:")
print(feature_variances)

# Keep only the columns whose variance exceeds the threshold.
X_selected = variance_selector.transform(X_tfidf)

removed_features_count = X_tfidf.shape[1] - X_selected.shape[1]
print(f"Number of features removed with variance threshold: {removed_features_count}")

# 70/30 split, stratified on the label so both parts keep the same ham/spam
# ratio; the fixed random_state makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X_selected, df['v1'], test_size=0.3, random_state=1234, stratify=df['v1'], shuffle=True
)

print(f"Shape of the train set: {X_train.shape}")
print(f"Shape of the test set: {X_test.shape}")

# Preview the first and last ten rows of the raw frame.
top_and_bottom_10_rows = pd.concat([df.head(10), df.tail(10)])
print("Top and Bottom 10 Rows:")
print(top_and_bottom_10_rows)





Feature Variances:
[9.03671768e-05 3.27686563e-04 9.69487312e-06 ... 1.49891048e-05
 1.99136446e-04 2.32359852e-05]
Number of features removed with variance threshold: 8511
Shape of the train set: (3900, 161)
Shape of the test set: (1672, 161)
Top and Bottom 10 Rows:
        v1                                                 v2
0      ham  Go until jurong point, crazy.. Available only ...
1      ham                      Ok lar... Joking wif u oni...
2     spam  Free entry in 2 a wkly comp to win FA Cup fina...
3      ham  U dun say so early hor... U c already then say...
4      ham  Nah I don't think he goes to usf, he lives aro...
5     spam  FreeMsg Hey there darling it's been 3 week's n...
6      ham  Even my brother is not like to speak with me. ...
7      ham  As per your request 'Melle Melle (Oru Minnamin...
8     spam  WINNER!! As a valued network customer you have...
9     spam  Had your mobile 11 months or more? U R entitle...
5562   ham  Ok lor... Sony ericsson salesman... I 

  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
