## M161 first question notebook, Knn with jaccard distance classifier
## Data preprocessing
### Data cleaning I
 1. check types 
 2. check for null values
 3. check duplicates
 4. keeping 10000 instances to reduce computation load

In [1]:
import pandas as pd
file_path = 'bigdata2025classification/train.csv'

def load_and_process_data(file_path):
    # Load data from a CSV file
    dataTrain = pd.read_csv(file_path)

    print("Data loaded successfully.")
    print("First 5 rows of the dataset:")
    print(dataTrain.head())

    print("\nData summary:")
    print(dataTrain.info())

    # Check for missing values in the dataframe
    print("\nMissing values in each column:")
    print(dataTrain.isnull().sum())
    
    return dataTrain

dataTrain = load_and_process_data(file_path)

# check column data types
def check_column_types(dataTrain):
    print("\nColumn data types:")
    print(dataTrain.dtypes)

check_column_types(dataTrain)





Data loaded successfully.
First 5 rows of the dataset:
       Id                                              Title  \
0  227464  Netflix is coming to cable boxes, and Amazon i...   
1  244074  Pharrell, Iranian President React to Tehran 'H...   
2   60707                    Wildlife service seeks comments   
3   27883  Facebook teams up with Storyful to launch 'FB ...   
4  169596           Caesars plans US$880 mln New York casino   

                                             Content          Label  
0   if you subscribe to one of three rinky-dink (...  Entertainment  
1   pharrell, iranian president react to tehran '...  Entertainment  
2   the u.s. fish and wildlife service has reopen...     Technology  
3   the very nature of social media means it is o...     Technology  
4   caesars plans us$880 mln new york casino jul ...       Business  

Data summary:
<class 'pandas.DataFrame'>
RangeIndex: 111795 entries, 0 to 111794
Data columns (total 4 columns):
 #   Column   Non-Null Cou

In [2]:
# Keep only the first 3000 instances for faster experimentation
dataTrain = dataTrain.iloc[:3000].reset_index(drop=True)
print(f"Subset shape: {dataTrain.shape}")

Subset shape: (3000, 4)


## Data cleaning II continue
3. check for duplicates
***************************
### note
- the data types of all exeprt Id column is "object" in pandas, it works, but could be converted to String for a performance uplift.
*****************************

In [3]:
# Check for duplicate rows in the dataframe
def check_duplicates(dataTrain):
    duplicate_count = dataTrain.duplicated().sum()
    print(f"\nNumber of duplicate rows: {duplicate_count}")
    return duplicate_count

check_duplicates(dataTrain)

# Check for duplicates based only on 'Title' column
def check_title_duplicates(dataTrain):
    if 'Title' in dataTrain.columns:
        dup_count = dataTrain.duplicated(subset=['Title']).sum()
        print(f"\nNumber of duplicate rows based on Title: {dup_count}")
        return dup_count
    else:
        print("'Title' column not found in the dataframe.")
        return None

check_title_duplicates(dataTrain)

# Check for duplicates based only on 'Content' column
def check_content_duplicates(dataTrain):
    if 'Content' in dataTrain.columns:
        dup_count = dataTrain.duplicated(subset=['Content']).sum()
        print(f"\nNumber of duplicate rows based on Content: {dup_count}")
        return dup_count
    else:
        print("'Content' column not found in the dataframe.")
        return None

check_content_duplicates(dataTrain)
# Check for duplicates based on 'Title' and 'Content' columns
def check_title_content_duplicates(dataTrain):
    if 'Title' in dataTrain.columns and 'Content' in dataTrain.columns:
        dup_count = dataTrain.duplicated(subset=['Title', 'Content']).sum()
        print(f"\nNumber of duplicate rows based on Title and Content: {dup_count}")
        return dup_count
    else:
        print("'Title' and/or 'Content' columns not found in the dataframe.")
        return None

check_title_content_duplicates(dataTrain)


Number of duplicate rows: 0

Number of duplicate rows based on Title: 5

Number of duplicate rows based on Content: 6

Number of duplicate rows based on Title and Content: 3


np.int64(3)

In [4]:
# Remove duplicates based on 'Title' and 'Content' columns, keeping the first occurrence
dataTrain = dataTrain.drop_duplicates(subset=['Title', 'Content'], keep='first')
print("\nDuplicates based on Title and Content removed. Data shape:", dataTrain.shape)


# Reset the index after removing duplicates
dataTrain = dataTrain.reset_index(drop=True)
print("\nIndex reset. Data shape:", dataTrain.shape)
dataTrain.info()


Duplicates based on Title and Content removed. Data shape: (2997, 4)

Index reset. Data shape: (2997, 4)
<class 'pandas.DataFrame'>
RangeIndex: 2997 entries, 0 to 2996
Data columns (total 4 columns):
 #   Column   Non-Null Count  Dtype
---  ------   --------------  -----
 0   Id       2997 non-null   int64
 1   Title    2997 non-null   str  
 2   Content  2997 non-null   str  
 3   Label    2997 non-null   str  
dtypes: int64(1), str(3)
memory usage: 93.8 KB



### Remove words not in English dictionary

- **probably could change dictionary for better results but it works...**


In [5]:
import re
import nltk
from nltk.corpus import words

# Download the words corpus if not already present
nltk.download('words')
english_words = set(words.words())

def clean_text(text):
    # Split text into words
    word_list = re.findall(r'\b\w+\b', str(text))
    cleaned_words = []
    for word in word_list:
        # Drop any word not in dictionary
        if word.lower() not in english_words:
            continue
        # Drop words with 2+ repeating chars not in dictionary (redundant now, but kept for clarity)
        if re.search(r'(.)\1{1,}', word):
            if word.lower() not in english_words:
                continue
        cleaned_words.append(word)
    return ' '.join(cleaned_words)

# Apply to both columns
dataTrain['Title'] = dataTrain['Title'].apply(clean_text)
dataTrain['Content'] = dataTrain['Content'].apply(clean_text)


[nltk_data] Downloading package words to
[nltk_data]     C:\Users\odys_\AppData\Roaming\nltk_data...
[nltk_data]   Package words is already up-to-date!


## Text clean up 
1. Expand contractions
2. Convert to lowercase
3. Remove special characters (keep only letters and spaces)
4. Remove extra spaces
5. Remove stopwords, lemmatize, and stem

**warning**
- takes up considerable time to execute

In [6]:
import re
import nltk
import contractions
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer

# Download required NLTK data if not already present
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')

stop_words = set(stopwords.words('english'))
stemmer = PorterStemmer()

def clean_text(text):
    # Expand contractions
    text = contractions.fix(text)
    # Convert to lowercase
    text = text.lower()
    # Remove special characters (keep only letters and spaces)
    text = re.sub(r'[^a-z\s]', '', text)
    # Remove extra spaces
    text = re.sub(r'\s+', ' ', text).strip()
    # Tokenize
    words = text.split()
    # Remove stopwords and stem
    words = [stemmer.stem(word) for word in words if word not in stop_words]
    text = ' '.join(words)
    return text

for col in ['Title', 'Content']:
    dataTrain[col] = dataTrain[col].astype(str).apply(clean_text)

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\odys_\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\odys_\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\odys_\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


### Just printing out the firtst 5 columns to see what happend to text

In [7]:
print(dataTrain.head())

       Id                       Title  \
0  227464  come cabl groceri overlord   
1  244074          presid react happi   
2   60707              wildlif servic   
3   27883                      launch   
4  169596          us new york casino   

                                             Content          Label  
0  subscrib one three dink compar speak cabl abl ...  Entertainment  
1  presid react happi singer presid took twitter ...  Entertainment  
2  fish wildlif servic comment period addit day p...     Technology  
3  natur social media often sourc real time break...     Technology  
4  us new york casino latest news top deck world ...       Business  


## Starting future extraction (converting text to numbers for ML algorythms to run)
- we should use **Bag of words** based on project requirements 
- Shoud take into account the title column in combination with content ()

### ℹ️info 

- countvetrorizer does convert everyting to lowecase and removes punctuation by default. could remove steps from above Nltk powered code cell
- Title and Content column are combined in to a sing string and then tokenized and vectorized

In [8]:
# Combine 'Title' and 'Content' columns into a single string
# and vectorize the result for classification
from sklearn.feature_extraction.text import CountVectorizer

# Create a new column that combines Title and Content
# (if either column is missing, fill with empty string)
dataTrain['Combined'] = dataTrain['Title'].fillna('') + ' ' + dataTrain['Content'].fillna('')

# Initialize CountVectorizer (Bag of Words)
bow_vectorizer = CountVectorizer(binary=True, max_features=5000)  # Use binary=True for presence/absence of words, limit to top 5000 features

# Fit and transform the combined column
dataTrain_bow = bow_vectorizer.fit_transform(dataTrain['Combined'])

# Show shape and a sample
print('Bag of Words matrix shape:', dataTrain_bow.shape)
print("dataTrain_bow sample (first 10 rows):", dataTrain_bow[:10].toarray())
print('Feature names (first 20):', bow_vectorizer.get_feature_names_out()[:20])

Bag of Words matrix shape: (2997, 5000)
dataTrain_bow sample (first 10 rows): [[0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]]
Feature names (first 20): ['aa' 'abandon' 'abdomin' 'abid' 'abil' 'abl' 'aboard' 'abort' 'abroad'
 'abruptli' 'absenc' 'absent' 'absolut' 'absorb' 'absurd' 'abu' 'abund'
 'abus' 'academ' 'academi']


### visual check for weird words, repetitions, etc...
- remember stemming and lemmatization was executed on text

In [9]:
print('Feature names (first 100):', bow_vectorizer.get_feature_names_out()[:1000])

Feature names (first 100): ['aa' 'abandon' 'abdomin' 'abid' 'abil' 'abl' 'aboard' 'abort' 'abroad'
 'abruptli' 'absenc' 'absent' 'absolut' 'absorb' 'absurd' 'abu' 'abund'
 'abus' 'academ' 'academi' 'acceler' 'accent' 'accept' 'access'
 'accessori' 'accid' 'accident' 'accommod' 'accomplish' 'accord'
 'accordingli' 'account' 'accur' 'accuraci' 'accus' 'achiev' 'acid'
 'acknowledg' 'acoust' 'acquir' 'acquisit' 'across' 'act' 'action' 'activ'
 'activist' 'actor' 'actress' 'actual' 'acut' 'ad' 'adapt' 'add' 'addict'
 'addit' 'address' 'adequ' 'adher' 'adjac' 'adjust' 'administr' 'admir'
 'admiss' 'admit' 'adob' 'adolesc' 'adopt' 'ador' 'adult' 'advanc'
 'advantag' 'adventur' 'advers' 'advertis' 'advic' 'advis' 'advisor'
 'advisori' 'advoc' 'advocaci' 'aerial' 'aesthet' 'affair' 'affect'
 'affili' 'affirm' 'afford' 'afloat' 'afraid' 'aftermath' 'afternoon'
 'afterward' 'age' 'agenc' 'agenda' 'agent' 'aggreg' 'aggress' 'ago'
 'agre' 'agreement' 'agricultur' 'ah' 'ahead' 'ai' 'aid' 'ail' 'aim'

## KNN with jaccard distance classifier

In [10]:
# KNN classification with Jaccard distance and 5-fold cross-validation on Bag of Words features
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import cross_val_score
from sklearn.metrics import jaccard_score, make_scorer
import numpy as np


# Use the same Bag of Words features as before
X = dataTrain_bow.toarray()
y = dataTrain['Label'].values

# Initialize KNN classifier with Jaccard distance (metric='jaccard')
knn_clf = KNeighborsClassifier(n_neighbors=5, metric='jaccard', n_jobs=-1)

# Perform 5-fold cross-validation using accuracy
cv_scores_acc = cross_val_score(knn_clf, X, y, cv=5, scoring='accuracy', n_jobs=-1)
print('KNN (Jaccard) 5-fold CV accuracy scores:', cv_scores_acc)
print('Mean CV accuracy:', np.mean(cv_scores_acc))



KNN (Jaccard) 5-fold CV accuracy scores: [0.875      0.86       0.87813022 0.86477462 0.85308848]
Mean CV accuracy: 0.8661986644407346
