In [4]:
import numpy as np 
import pandas as pd
import string


In [5]:

# Read the raw comments from 'test.csv' into a DataFrame
test_data = pd.read_csv('test.csv')

# Preview the untouched rows so the effect of the cleaning step is visible
print("Before removing punctuation:")
display(test_data.head())

# Strip punctuation with a regex substitution.
# The character class [^\w\s] matches every character that is neither a word
# character (\w) nor whitespace (\s). Apostrophes are NOT exempt, so "you'll"
# becomes "youll"; underscores survive because \w includes them.
# Extend the class (e.g. [^\w\s']) if some characters should be kept.
raw_comments = test_data['comment_text']
test_data['comment_text'] = raw_comments.str.replace(r'[^\w\s]', '', regex=True)

# Preview the same rows after cleaning to verify the change
print("\nAfter removing punctuation:")
display(test_data.head())

# Persist this intermediate stage to its own CSV file (index column omitted)
output_file_path1 = 'processed_test_data1.csv'
test_data.to_csv(output_file_path1, index=False)


Before removing punctuation:


Unnamed: 0,id,comment_text
0,00001cee341fdb12,Yo bitch Ja Rule is more succesful then you'll...
1,0000247867823ef7,== From RfC == \n\n The title is fine as it is...
2,00013b17ad220c46,""" \n\n == Sources == \n\n * Zawe Ashton on Lap..."
3,00017563c3f7919a,":If you have a look back at the source, the in..."
4,00017695ad8997eb,I am running in the park



After removing punctuation:


Unnamed: 0,id,comment_text
0,00001cee341fdb12,Yo bitch Ja Rule is more succesful then youll ...
1,0000247867823ef7,From RfC \n\n The title is fine as it is IMO
2,00013b17ad220c46,\n\n Sources \n\n Zawe Ashton on Lapland
3,00017563c3f7919a,If you have a look back at the source the info...
4,00017695ad8997eb,I am running in the park


In [6]:
import nltk
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from nltk.corpus import wordnet
from nltk import pos_tag

# Fetch the NLTK resources this cell relies on: the WordNet data, the Punkt
# tokenizer models and the averaged-perceptron POS tagger.
# NOTE(review): newer NLTK releases may look for differently named resources
# (e.g. 'punkt_tab') -- confirm against the installed NLTK version.
for nltk_resource in ('wordnet', 'punkt', 'averaged_perceptron_tagger'):
    nltk.download(nltk_resource)

# Single lemmatizer instance shared by every call to lemmatize_text
lemmatizer = WordNetLemmatizer()

# Helper mapping an NLTK POS tag to the corresponding WordNet POS constant
def get_wordnet_pos(nltk_pos):
    """Return the WordNet POS constant selected by the start of ``nltk_pos``.

    Tags starting with 'J', 'V' and 'R' map to ``wordnet.ADJ``,
    ``wordnet.VERB`` and ``wordnet.ADV`` respectively. Every other tag,
    including those starting with 'N', maps to ``wordnet.NOUN``.
    """
    prefix_table = (
        ('J', wordnet.ADJ),
        ('V', wordnet.VERB),
        ('R', wordnet.ADV),
    )
    for prefix, wordnet_pos in prefix_table:
        if nltk_pos.startswith(prefix):
            return wordnet_pos
    # Noun tags and anything unrecognised both fall back to NOUN
    return wordnet.NOUN

# Lemmatize a piece of text, using each token's POS tag to guide the lemmatizer
def lemmatize_text(text):
    """Return ``text`` with every token replaced by its lemma.

    The text is tokenized with ``word_tokenize`` and tagged with ``pos_tag``;
    each tag is converted by ``get_wordnet_pos`` and passed to the shared
    ``lemmatizer``. The lemmas are joined with single spaces, so the original
    spacing and line breaks of ``text`` are not preserved.
    """
    tagged_tokens = pos_tag(word_tokenize(text))
    lemmas = []
    for token, tag in tagged_tokens:
        lemmas.append(lemmatizer.lemmatize(token, get_wordnet_pos(tag)))
    return ' '.join(lemmas)

# Apply the lemmatization function to the 'comment_text' column.
# Missing comments are replaced with '' before the cast: a bare astype(str)
# would turn NaN into the literal text 'nan', which would then be lemmatized
# and later counted as if it were a real word.
test_data['comment_text'] = test_data['comment_text'].fillna('').astype(str)
test_data['comment_text'] = test_data['comment_text'].apply(lemmatize_text)

# Display the first few rows after lemmatization to verify the changes
print("\nAfter lemmatization:")
display(test_data.head())

# Save the processed data to a new CSV file (index column omitted)
output_file_path2 = 'processed_test_data2.csv'
test_data.to_csv(output_file_path2, index=False)



[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\OUSSAMA\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\OUSSAMA\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\OUSSAMA\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!



After lemmatization:


Unnamed: 0,id,comment_text
0,00001cee341fdb12,Yo bitch Ja Rule be more succesful then youll ...
1,0000247867823ef7,From RfC The title be fine a it be IMO
2,00013b17ad220c46,Sources Zawe Ashton on Lapland
3,00017563c3f7919a,If you have a look back at the source the info...
4,00017695ad8997eb,I be run in the park


In [7]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

# Make sure the stopword corpus and the Punkt tokenizer models are available
for nltk_resource in ('stopwords', 'punkt'):
    nltk.download(nltk_resource)

# English stopwords from NLTK, held in a set for fast membership tests
english_stopwords = stopwords.words('english')
stop_words = set(english_stopwords)

# Drop stopwords from a piece of text
def remove_stopwords(text):
    """Return ``text`` without the tokens found in ``stop_words``.

    The text is split with ``word_tokenize``. A token is discarded when its
    lowercase form is in ``stop_words``; kept tokens retain their original
    casing and are joined with single spaces.
    """
    kept_tokens = []
    for token in word_tokenize(text):
        if token.lower() in stop_words:
            continue  # stopword: skip it
        kept_tokens.append(token)
    return ' '.join(kept_tokens)

# Run stopword removal over every comment and write the result back
# into the 'comment_text' column
comments_without_stopwords = test_data['comment_text'].apply(remove_stopwords)
test_data['comment_text'] = comments_without_stopwords

# Show the first few rows so the effect of the filtering can be checked
print("\nAfter removing stopwords:")
display(test_data.head())



[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\OUSSAMA\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\OUSSAMA\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!



After removing stopwords:


Unnamed: 0,id,comment_text
0,00001cee341fdb12,Yo bitch Ja Rule succesful youll ever whats ha...
1,0000247867823ef7,RfC title fine IMO
2,00013b17ad220c46,Sources Zawe Ashton Lapland
3,00017563c3f7919a,look back source information update correct fo...
4,00017695ad8997eb,run park


In [8]:
# Save the current contents of test_data to a third CSV file (index column omitted)
output_file_path3 = 'processed_test_data3.csv'
test_data.to_csv(path_or_buf=output_file_path3, index=False)

In [13]:
from sklearn.feature_extraction.text import CountVectorizer

# Build a bag-of-words count matrix from the processed 'comment_text'
# column of the test_data DataFrame
vectorizer = CountVectorizer()
processed_comments = test_data['comment_text']
X = vectorizer.fit_transform(processed_comments)

# Feature names (words or tokens) learned by the vectorizer
feature_names = vectorizer.get_feature_names_out()
vocabulary_size = len(feature_names)

# Print a blank separator line, then the feature names
print()
print("Feature names (words or tokens):")
print(feature_names)

# Report how many distinct features were extracted
print("\nNumber of unique features (words or tokens):", vocabulary_size)


Feature names (words or tokens):
['00' '000' '0000' ... 'ｙｏｕｒ' 'ｙｏｕｒｓｅｌｆ' '𨳒你老母个閪']

Number of unique features (words or tokens): 294386


In [21]:
from sklearn.feature_extraction.text import TfidfTransformer

# Re-weight the bag-of-words counts in X with a TF-IDF transformer
tfidf_transformer = TfidfTransformer()
X_tfidf = tfidf_transformer.fit_transform(X)

# Convert only a small corner of the matrix to a dense array for inspection:
# the first 5 rows and the first 100 feature columns, labelled with the
# matching feature names
preview_block = X_tfidf[:5, :100]
preview_columns = feature_names[:100]
tfidf_part_df = pd.DataFrame(preview_block.toarray(), columns=preview_columns)

# Show the preview DataFrame
print("Partial view of the TF-IDF matrix:")
display(tfidf_part_df)


Partial view of the TF-IDF matrix:


Unnamed: 0,00,000,0000,000000,00000000,0000000000000000000000000000,000000000000000000000000000000000fdgkja,000000000000111265005605361866087675053350036566001020343907867982125026173889636993408203125,0000001,00000010,...,00085,0009,00090,00095,0009png,001,0010,00100,00105,0011
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
