In [1]:
import os
import pandas as pd

# Directory paths for positive and negative comments
positive_dir = r'C:\Users\macit\Documents\Visual_Studio_Projects\LIE-Thomas3-DA\NLP\03-Project\aclImdb\train_created\pos'
negative_dir = r'C:\Users\macit\Documents\Visual_Studio_Projects\LIE-Thomas3-DA\NLP\03-Project\aclImdb\train_created\neg'

# Function to read comments from a directory
def read_comments_from_directory(directory, label):
    comments = []
    for filename in os.listdir(directory):
        if filename.endswith(".txt"):
            file_path = os.path.join(directory, filename)
            with open(file_path, 'r', encoding='utf-8') as file:
                comment = file.read()
                comments.append({'comment': comment, 'label': label})
    return comments

# Read positive and negative comments
positive_comments = read_comments_from_directory(positive_dir, label='positive')
negative_comments = read_comments_from_directory(negative_dir, label='negative')

# Combine positive and negative comments into a single DataFrame
comments_df = pd.DataFrame(positive_comments + negative_comments)

# Shuffle the DataFrame
comments_df = comments_df.sample(frac=1, random_state=42).reset_index(drop=True)

# Display the first few rows of the combined dataset
print(comments_df.head())

# Optionally, save the combined dataset to a CSV file
comments_df.to_csv('combined_comments.csv', index=False)


                                             comment     label
0  Ok, first the good: Cher's performance and the...  negative
1  This movie has Wild Bill Hickok, Calamity Jane...  positive
2  An actress making a movie in Africa is kidnapp...  negative
3  All the talent Mr. Sooraj Barjatya showed in h...  positive
4  Sexo Cannibal, or Devil Hunter as it's more co...  negative


In [49]:
comments_df

Unnamed: 0,comment,label
0,"Ok, first the good: Cher's performance and the...",negative
1,"This movie has Wild Bill Hickok, Calamity Jane...",positive
2,An actress making a movie in Africa is kidnapp...,negative
3,All the talent Mr. Sooraj Barjatya showed in h...,positive
4,"Sexo Cannibal, or Devil Hunter as it's more co...",negative
...,...,...
1997,I am not so much like Love Sick as I image. Fi...,negative
1998,what ends up killing this movie is its self-co...,negative
1999,This movie just pulls you so deeply into the t...,positive
2000,I was one of quite a few extras in this big bo...,negative


In [50]:
import re
from bs4 import BeautifulSoup
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

# Download NLTK resources (if not already downloaded)
import nltk
nltk.download('punkt')
nltk.download('stopwords')

# Function for text preprocessing
def preprocess_text(text):
    # Remove HTML tags
    text = BeautifulSoup(text, 'html.parser').get_text()
    
    # Remove special characters, numbers, and punctuation
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    
    # Remove '<br /><br />' pattern
    text = re.sub(r'<br\s?/>', ' ', text)
    
    # Tokenization
    tokens = word_tokenize(text)
    
    # Remove stop words
    stop_words = set(stopwords.words('english'))
    tokens = [word for word in tokens if word not in stop_words]
    
    # Lemmatization
    lemmatizer = WordNetLemmatizer()
    tokens = [lemmatizer.lemmatize(word) for word in tokens]
    
    # Join tokens back into text
    preprocessed_text = ' '.join(tokens)
    
    return preprocessed_text

# Apply preprocessing to the 'comment' column
comments_df['preprocessed_comment'] = comments_df['comment'].apply(preprocess_text)

# Display the first few rows of the updated preprocessed dataset
print(comments_df[['comment', 'preprocessed_comment', 'label']].head())


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\macit\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\macit\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


                                             comment  \
0  Ok, first the good: Cher's performance and the...   
1  This movie has Wild Bill Hickok, Calamity Jane...   
2  An actress making a movie in Africa is kidnapp...   
3  All the talent Mr. Sooraj Barjatya showed in h...   
4  Sexo Cannibal, or Devil Hunter as it's more co...   

                                preprocessed_comment     label  
0  Ok first good Chers performance cinematography...  negative  
1  This movie Wild Bill Hickok Calamity Jane Buff...  positive  
2  An actress making movie Africa kidnapped taken...  negative  
3  All talent Mr Sooraj Barjatya showed first mov...  positive  
4  Sexo Cannibal Devil Hunter commonly known amon...  negative  


In [51]:
preprocessed_df = comments_df[['preprocessed_comment', 'label']].copy()

preprocessed_df

Unnamed: 0,preprocessed_comment,label
0,Ok first good Chers performance cinematography...,negative
1,This movie Wild Bill Hickok Calamity Jane Buff...,positive
2,An actress making movie Africa kidnapped taken...,negative
3,All talent Mr Sooraj Barjatya showed first mov...,positive
4,Sexo Cannibal Devil Hunter commonly known amon...,negative
...,...,...
1997,I much like Love Sick I image Finally film exp...,negative
1998,end killing movie selfconsciousness among thin...,negative
1999,This movie pull deeply two main character I po...,positive
2000,I one quite extra big bomb I happened right pl...,negative


In [3]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Flatten, Dense

# Assuming 'comments_df' has 'comment' and 'label' columns

# Step 1: Split the data into training and testing sets
train_df, test_df = train_test_split(comments_df, test_size=0.2, random_state=42)

# Step 2: Preprocess the text data
def preprocess_text(text):
    # Your preprocessing steps here (e.g., HTML tag removal, special character removal, tokenization, etc.)
    return text

train_df['preprocessed_comment'] = train_df['comment'].apply(preprocess_text)
test_df['preprocessed_comment'] = test_df['comment'].apply(preprocess_text)

# Step 3: Encode the labels
label_encoder = LabelEncoder()
train_labels_encoded = label_encoder.fit_transform(train_df['label'])
test_labels_encoded = label_encoder.transform(test_df['label'])

# Step 4: Tokenize and pad sequences
max_words = 100  # Choose an appropriate value
tokenizer = Tokenizer(num_words=max_words, oov_token="<OOV>")
tokenizer.fit_on_texts(train_df['preprocessed_comment'])

X_train_seq = tokenizer.texts_to_sequences(train_df['preprocessed_comment'])
X_test_seq = tokenizer.texts_to_sequences(test_df['preprocessed_comment'])

max_length = 100  # Choose an appropriate value
X_train_padded = pad_sequences(X_train_seq, maxlen=max_length, padding='post')
X_test_padded = pad_sequences(X_test_seq, maxlen=max_length, padding='post')

# Step 5: Build the model
vocab_size = len(tokenizer.word_index) + 1

model = Sequential()
model.add(Embedding(input_dim=vocab_size, output_dim=32, input_length=max_length))
model.add(Flatten())
model.add(Dense(250, activation='relu'))
model.add(Dense(1, activation='sigmoid'))

model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
model.summary()

# Step 6: Fit the model
model.fit(X_train_padded, train_labels_encoded, validation_data=(X_test_padded, test_labels_encoded), epochs=2, batch_size=128, verbose=2)

# Step 7: Evaluate the model
scores = model.evaluate(X_test_padded, test_labels_encoded, verbose=0)
print("Accuracy: %.2f%%" % (scores[1]*100))





Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 100, 32)           717760    
                                                                 
 flatten (Flatten)           (None, 3200)              0         
                                                                 
 dense (Dense)               (None, 250)               800250    
                                                                 
 dense_1 (Dense)             (None, 1)                 251       
                                                                 
Total params: 1518261 (5.79 MB)
Trainable params: 1518261 (5.79 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________
Epoch 1/2


13/13 - 3s - loss: 0.6936 - accuracy: 0.5041 - val_loss: 0.6886 - val_accuracy: 0.5362 - 3s/epoch - 258ms/step
Epoch 2/2
13/13 - 1s - l

In [5]:
train_labels_encoded


array([1, 0, 1, ..., 0, 1, 0])