## M161 first question notebook, prepare test data
## Data preprocessing
### Data cleaning I
 1. check types 
 2. check for null values
 3. check duplicates
 4. keeping 10000 instances to reduce computation load (not applied in this notebook: all 47912 test rows are kept)

In [2]:
import pandas as pd
file_path = 'bigdata2025classification/test_without_labels.csv'

def load_and_process_data(file_path):
    """Read a CSV file into a DataFrame and print a short inspection report.

    The report consists of the first five rows, the ``DataFrame.info()``
    summary and the number of missing values per column.

    Args:
        file_path: Path of the CSV file, passed straight to ``pd.read_csv``.

    Returns:
        The DataFrame exactly as loaded; nothing is modified here.
    """
    frame = pd.read_csv(file_path)

    print("Data loaded successfully.")
    print("First 5 rows of the dataset:")
    print(frame.head())

    print("\nData summary:")
    # info() writes the summary itself and returns None, so wrapping it in
    # print() would add a stray "None" line to the report.
    frame.info()

    # Count missing values per column
    print("\nMissing values in each column:")
    print(frame.isnull().sum())

    return frame

# Load the unlabeled test split. The frame keeps the name dataTrain because
# every later cell in this notebook refers to it by that name.
dataTrain = load_and_process_data(file_path)

# check column data types
def check_column_types(dataTrain):
    """Print a heading followed by the dtype of every column in ``dataTrain``."""
    print("\nColumn data types:", dataTrain.dtypes, sep="\n")

check_column_types(dataTrain)





Data loaded successfully.
First 5 rows of the dataset:
       Id                                              Title  \
0  262120  Tracy Morgan upgraded to fair condition after ...   
1  175132  Smartphones Weigh on Samsung Electronics as Gu...   
2  218739  FBI denies fumbling testimony on 'X-Men' direc...   
3  253483  Bachelorette 2014 Spoilers: Week 3 Recap ??? E...   
4  224109  Barack Obama honours Frankie Knuckles in lette...   

                                             Content  
0   actor and comedian tracy morgan has been upgr...  
1  samsung electronics co ltd on tuesday issued u...  
2   michael f. egan iii said in a press conferenc...  
3   i am having mixed emotions for what is about ...  
4   u.s. president barack obama has paid a specia...  

Data summary:
<class 'pandas.DataFrame'>
RangeIndex: 47912 entries, 0 to 47911
Data columns (total 3 columns):
 #   Column   Non-Null Count  Dtype
---  ------   --------------  -----
 0   Id       47912 non-null  int64
 1   Title

## Data cleaning II continue
3. check for duplicates
***************************
### note
- The data type of every column except `Id` is `object` in pandas. This works, but the columns could be converted to the string dtype for a performance uplift.
*****************************

In [3]:
# Check for duplicate rows in the dataframe
def check_duplicates(dataTrain):
    """Count, print and return the number of fully duplicated rows.

    A row counts as a duplicate when all of its column values repeat an
    earlier row (the default behaviour of ``DataFrame.duplicated``).
    """
    repeat_mask = dataTrain.duplicated()
    n_repeats = repeat_mask.sum()
    print(f"\nNumber of duplicate rows: {n_repeats}")
    return n_repeats

check_duplicates(dataTrain)

# Check for duplicates based only on 'Title' column
def check_title_duplicates(dataTrain):
    """Count, print and return the rows whose 'Title' repeats an earlier row.

    Returns None (after printing a notice) when the frame has no 'Title'
    column.
    """
    if 'Title' not in dataTrain.columns:
        print("'Title' column not found in the dataframe.")
        return None

    repeated_titles = dataTrain.duplicated(subset=['Title'])
    title_dups = repeated_titles.sum()
    print(f"\nNumber of duplicate rows based on Title: {title_dups}")
    return title_dups

check_title_duplicates(dataTrain)

# Check for duplicates based only on 'Content' column
def check_content_duplicates(dataTrain):
    """Count, print and return the rows whose 'Content' repeats an earlier row.

    Returns None (after printing a notice) when the frame has no 'Content'
    column.
    """
    if 'Content' not in dataTrain.columns:
        print("'Content' column not found in the dataframe.")
        return None

    repeated_content = dataTrain.duplicated(subset=['Content'])
    content_dups = repeated_content.sum()
    print(f"\nNumber of duplicate rows based on Content: {content_dups}")
    return content_dups

check_content_duplicates(dataTrain)
# Check for duplicates based on 'Title' and 'Content' columns
def check_title_content_duplicates(dataTrain):
    """Count, print and return the rows repeating an earlier (Title, Content) pair.

    Returns None (after printing a notice) when either column is missing.
    """
    available = dataTrain.columns
    if 'Title' not in available or 'Content' not in available:
        print("'Title' and/or 'Content' columns not found in the dataframe.")
        return None

    repeated_pairs = dataTrain.duplicated(subset=['Title', 'Content'])
    pair_dups = repeated_pairs.sum()
    print(f"\nNumber of duplicate rows based on Title and Content: {pair_dups}")
    return pair_dups

check_title_content_duplicates(dataTrain)


Number of duplicate rows: 0

Number of duplicate rows based on Title: 251

Number of duplicate rows based on Content: 419

Number of duplicate rows based on Title and Content: 111


np.int64(111)

## Duplicates are not removed from the test set (removing them makes no sense for the later evaluation)
*****************

In [None]:
# # Remove duplicates based on 'Title' and 'Content' columns, keeping the first occurrence
# dataTrain = dataTrain.drop_duplicates(subset=['Title', 'Content'], keep='first')
# print("\nDuplicates based on Title and Content removed. Data shape:", dataTrain.shape)


# # Reset the index after removing duplicates
# dataTrain = dataTrain.reset_index(drop=True)
# print("\nIndex reset. Data shape:", dataTrain.shape)
# dataTrain.info()


### Remove words not in English dictionary

- **A different dictionary would probably give better results, but this one works.**


In [4]:
import re
import nltk
from nltk.corpus import words

# Download the words corpus if not already present
nltk.download('words')
# Build a set once so the per-token membership tests in clean_text stay fast
english_words = set(words.words())

def clean_text(text, vocabulary=None):
    """Keep only the tokens of ``text`` whose lowercase form is a dictionary word.

    Tokens are maximal runs of word characters, so punctuation (including
    apostrophes and hyphens) is dropped along the way. Surviving tokens keep
    their original casing and are joined with single spaces.

    Args:
        text: Value to filter; it is converted with ``str()`` first.
        vocabulary: Optional container of lowercase words used for the
            membership test. Defaults to the module-level ``english_words``.

    Returns:
        The filtered text as a single space-separated string.
    """
    if vocabulary is None:
        vocabulary = english_words

    tokens = re.findall(r'\b\w+\b', str(text))
    # No separate check for repeated characters is needed: such a check could
    # only drop tokens that are missing from the vocabulary, and those are
    # already excluded by the membership test below.
    return ' '.join(token for token in tokens if token.lower() in vocabulary)

# Run the dictionary filter over each text column in turn
for text_column in ('Title', 'Content'):
    dataTrain[text_column] = dataTrain[text_column].apply(clean_text)


[nltk_data] Downloading package words to
[nltk_data]     C:\Users\odys_\AppData\Roaming\nltk_data...
[nltk_data]   Package words is already up-to-date!


## Text clean up 
1. Expand contractions
2. Convert to lowercase
3. Remove special characters (keep only letters and spaces)
4. Remove extra spaces
5. Remove stopwords, lemmatize, and stem

**warning**
- this cell takes considerable time to execute

In [5]:
import re
import nltk
import contractions
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer, PorterStemmer

# Download required NLTK data if not already present
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')

# Shared helpers, built once here and reused by clean_text for every row
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()
stemmer = PorterStemmer()

def clean_text(text):
    """Normalize ``text`` into a space-separated string of stemmed tokens.

    Steps, in order: expand contractions, lowercase, delete every character
    that is not a-z or whitespace, collapse whitespace, drop stopwords, then
    lemmatize and stem each remaining token.

    Note: this definition replaces the earlier dictionary-filter function of
    the same name once this cell has run.
    """
    # NOTE(review): when this runs after the dictionary-filter cell,
    # apostrophes have already been dropped, so contraction expansion probably
    # has little effect - confirm the intended cell order.
    expanded = contractions.fix(text)
    letters_only = re.sub(r'[^a-z\s]', '', expanded.lower())
    tokens = re.sub(r'\s+', ' ', letters_only).strip().split()

    stems = []
    for token in tokens:
        if token in stop_words:
            continue
        stems.append(stemmer.stem(lemmatizer.lemmatize(token)))
    return ' '.join(stems)

# Normalize both text columns, coercing every value to str first
for col in ('Title', 'Content'):
    column_as_text = dataTrain[col].astype(str)
    dataTrain[col] = column_as_text.apply(clean_text)

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\odys_\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\odys_\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\odys_\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


### Print the first 5 rows to see what happened to the text

In [8]:
# Preview the cleaned rows and report the frame's shape
print(dataTrain.head())
print("\nData cleaning completed. Data shape:", dataTrain.shape)
# info() prints its own report and returns None; wrapping it in print()
# would add a stray "None" line to the output.
dataTrain.info()

       Id                                  Title  \
0  262120               morgan fair condit crash   
1  175132                 weigh electron guidanc   
2  218739  fumbl testimoni x men director singer   
3  253483             week recap eric hill drama   
4  224109                                 letter   

                                             Content  
0  actor comedian morgan fair condit follow new j...  
1  electron unexpectedli weak quarterli earn guid...  
2  f said press confer around like meat sex direc...  
3  mix happen tonight excit see want see happen e...  
4  presid special tribut club music icon pen lett...  

Data cleaning completed. Data shape: (47912, 3)
<class 'pandas.DataFrame'>
RangeIndex: 47912 entries, 0 to 47911
Data columns (total 3 columns):
 #   Column   Non-Null Count  Dtype
---  ------   --------------  -----
 0   Id       47912 non-null  int64
 1   Title    47912 non-null  str  
 2   Content  47912 non-null  str  
dtypes: int64(1), str(2)
memory

In [7]:
import os

import joblib

# Persist the cleaned test frame (still named dataTrain in this notebook).
# The cache directory is created first: joblib.dump does not create missing
# parent directories, so a fresh checkout would otherwise fail here.
os.makedirs('joblibCache', exist_ok=True)
joblib.dump(dataTrain, 'joblibCache/dataTest_cleaned.joblib')
print('dataTrain dataframe saved to joblibCache/dataTest_cleaned.joblib')

dataTrain dataframe saved to joblibCache/dataTest_cleaned.joblib
