In [11]:
#preprocess step.....

import pandas as pd
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk import pos_tag
import re



#DATASET
summaries_train_path = "C:\\Users\\nh013\\Desktop\\CommonLit - Evaluate Student Summaries\\summaries_train.csv"
summaries_test_path = "C:\\Users\\nh013\\Desktop\\CommonLit - Evaluate Student Summaries\\summaries_test.csv"
prompts_train_path = "C:\\Users\\nh013\\Desktop\\CommonLit - Evaluate Student Summaries\\prompts_train.csv"
prompts_test_path = "C:\\Users\\nh013\\Desktop\\CommonLit - Evaluate Student Summaries\\prompts_test.csv"

summaries_train_df = pd.read_csv(summaries_train_path)
summaries_test_df = pd.read_csv(summaries_test_path)
prompts_train_df = pd.read_csv(prompts_train_path)
prompts_test_df = pd.read_csv(prompts_test_path)




#'content' and 'wording' COLUMNS ARE CONVERTED TO STRING
summaries_train_df['content'] = summaries_train_df['content'].astype(str)
summaries_train_df['wording'] = summaries_train_df['wording'].astype(str)

#COMBINE TEXT DATA FROM SUMMARAISE AND PROMPTS DATAFRANE
summaries_train_df['text'] = summaries_train_df['content'] + " " + summaries_train_df['wording']
prompts_train_df['prompt_text'] = prompts_train_df['prompt_question'] + " " + prompts_train_df['prompt_title'] + " " + prompts_train_df['prompt_text']




# PREPROCESSING FUNCTION
def preprocess_text(text):
    
    # CONVERT TO LOWERCASE
    text = text.lower()
    
    #REMOVE URLS
    text = re.sub(r'http\S+', '', text)
    
    # PREMOVE SPECIAL CHERECTER AND PUNCTUATION
    text = re.sub(r'[^\w\s]', '', text)
    
    # TOKENIZATION
    tokens = word_tokenize(text)
    
    # REMOVE STOPWORDS
    stop_words = set(stopwords.words('english'))
    filtered_tokens = [word for word in tokens if word not in stop_words]
    
    return " ".join(filtered_tokens)

def apply_lemmatization(tokens):
    lemmatizer = WordNetLemmatizer()
    lemmatized_tokens = [lemmatizer.lemmatize(token) for token in tokens]
    return lemmatized_tokens

def apply_stemming(tokens):
    stemmer = PorterStemmer()
    stemmed_tokens = [stemmer.stem(token) for token in tokens]
    return stemmed_tokens




# PREPROCESSING FOR  summaries_train_df
summaries_train_df['preprocessed_text'] = summaries_train_df['text'].apply(preprocess_text)
summaries_train_df['preprocessed_content'] = summaries_train_df['content'].apply(preprocess_text)
summaries_train_df['preprocessed_wording'] = summaries_train_df['wording'].apply(preprocess_text)

#  PREPROCESSING FOR prompts_train_df
prompts_train_df['preprocessed_prompt_text'] = prompts_train_df['prompt_text'].apply(preprocess_text)
prompts_train_df['preprocessed_prompt_question'] = prompts_train_df['prompt_question'].apply(preprocess_text)
prompts_train_df['preprocessed_prompt_title'] = prompts_train_df['prompt_title'].apply(preprocess_text)


# TOKENIZATION, STEMMING, AND LEMMATIZATION
summaries_train_df['tokens'] = summaries_train_df['preprocessed_text'].apply(word_tokenize)
summaries_train_df['stemmed_tokens'] = summaries_train_df['tokens'].apply(apply_stemming)
summaries_train_df['lemmatized_tokens'] = summaries_train_df['tokens'].apply(apply_lemmatization)

prompts_train_df['tokens'] = prompts_train_df['preprocessed_prompt_text'].apply(word_tokenize)
prompts_train_df['stemmed_tokens'] = prompts_train_df['tokens'].apply(apply_stemming)
prompts_train_df['lemmatized_tokens'] = prompts_train_df['tokens'].apply(apply_lemmatization)



# CONVERT 'content' and 'wording' COLUMNS TO NUMERIC
summaries_train_df['content'] = pd.to_numeric(summaries_train_df['content'], errors='coerce')
summaries_train_df['wording'] = pd.to_numeric(summaries_train_df['wording'], errors='coerce')



# DROP NAN VALUES IN  'content' and 'wording' columns
summaries_train_df = summaries_train_df.dropna(subset=['content', 'wording'])



# PART OF SPEECH TAGGING


def apply_pos_tagging(tokens):
    return pos_tag(tokens)

summaries_train_df['pos_tags'] = summaries_train_df['tokens'].apply(apply_pos_tagging)
prompts_train_df['pos_tags'] = prompts_train_df['tokens'].apply(apply_pos_tagging)



print("Preprocessed Summaries Train Data:")
print(summaries_train_df[['text','content','wording', 'preprocessed_text', 'tokens', 'stemmed_tokens', 'lemmatized_tokens', 'pos_tags']].head())

print("\nPreprocessed Prompts Train Data:")
print(prompts_train_df[['prompt_text','prompt_question','prompt_title', 'preprocessed_prompt_text', 'tokens', 'stemmed_tokens', 'lemmatized_tokens', 'pos_tags']].head())


Preprocessed Summaries Train Data:
                                    text   content   wording  \
0    0.205682506482641 0.380537638762288  0.205683  0.380538   
1   -0.548304076980462 0.506755353548534 -0.548304  0.506755   
2      3.12892846350062 4.23122555224945  3.128928  4.231226   
3  -0.210613934166593 -0.471414826967448 -0.210614 -0.471415   
4      3.27289414977436 3.21975651022738  3.272894  3.219757   

                   preprocessed_text                                tokens  \
0  0205682506482641 0380537638762288  [0205682506482641, 0380537638762288]   
1  0548304076980462 0506755353548534  [0548304076980462, 0506755353548534]   
2    312892846350062 423122555224945    [312892846350062, 423122555224945]   
3  0210613934166593 0471414826967448  [0210613934166593, 0471414826967448]   
4    327289414977436 321975651022738    [327289414977436, 321975651022738]   

                         stemmed_tokens                     lemmatized_tokens  \
0  [0205682506482641, 03805376

In [23]:
# perform tf-idf

import pandas as pd
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk import pos_tag
from sklearn.feature_extraction.text import TfidfVectorizer
import re

#DATASET
summaries_train_path = "C:\\Users\\nh013\\Desktop\\CommonLit - Evaluate Student Summaries\\summaries_train.csv"
summaries_test_path = "C:\\Users\\nh013\\Desktop\\CommonLit - Evaluate Student Summaries\\summaries_test.csv"
prompts_train_path = "C:\\Users\\nh013\\Desktop\\CommonLit - Evaluate Student Summaries\\prompts_train.csv"
prompts_test_path = "C:\\Users\\nh013\\Desktop\\CommonLit - Evaluate Student Summaries\\prompts_test.csv"

summaries_train_df = pd.read_csv(summaries_train_path)
summaries_test_df = pd.read_csv(summaries_test_path)
prompts_train_df = pd.read_csv(prompts_train_path)
prompts_test_df = pd.read_csv(prompts_test_path)




#'content' and 'wording' COLUMNS ARE CONVERTED TO STRING
summaries_train_df['content'] = summaries_train_df['content'].astype(str)
summaries_train_df['wording'] = summaries_train_df['wording'].astype(str)

#COMBINE TEXT DATA FROM SUMMARAISE AND PROMPTS DATAFRANE
summaries_train_df['text'] = summaries_train_df['content'] + " " + summaries_train_df['wording']
prompts_train_df['prompt_text'] = prompts_train_df['prompt_question'] + " " + prompts_train_df['prompt_title'] + " " + prompts_train_df['prompt_text']




# PREPROCESSING FUNCTION
def preprocess_text(text):
    
    
    # CONVERT TO LOWERCASE
    text = text.lower()
    
    #REMOVE URLS
    text = re.sub(r'http\S+', '', text)
    
    # PREMOVE SPECIAL CHERECTER AND PUNCTUATION
    text = re.sub(r'[^\w\s]', '', text)
    
    # TOKENIZATION
    tokens = word_tokenize(text)
    
    # REMOVE STOPWORDS
    stop_words = set(stopwords.words('english'))
    filtered_tokens = [word for word in tokens if word not in stop_words]
    
    return " ".join(filtered_tokens)

def apply_lemmatization(tokens):
    lemmatizer = WordNetLemmatizer()
    lemmatized_tokens = [lemmatizer.lemmatize(token) for token in tokens]
    return lemmatized_tokens

def apply_stemming(tokens):
    stemmer = PorterStemmer()
    stemmed_tokens = [stemmer.stem(token) for token in tokens]
    return stemmed_tokens




# PREPROCESSING FOR  summaries_train_df
summaries_train_df['preprocessed_text'] = summaries_train_df['text'].apply(preprocess_text)
summaries_train_df['preprocessed_content'] = summaries_train_df['content'].apply(preprocess_text)
summaries_train_df['preprocessed_wording'] = summaries_train_df['wording'].apply(preprocess_text)

# PREPROCESSING FOR prompts_train_df
prompts_train_df['preprocessed_prompt_text'] = prompts_train_df['prompt_text'].apply(preprocess_text)
prompts_train_df['preprocessed_prompt_question'] = prompts_train_df['prompt_question'].apply(preprocess_text)
prompts_train_df['preprocessed_prompt_title'] = prompts_train_df['prompt_title'].apply(preprocess_text)


# TOKENIZATION, STEMMING, AND LEMMATIZATION
summaries_train_df['tokens'] = summaries_train_df['preprocessed_text'].apply(word_tokenize)
summaries_train_df['stemmed_tokens'] = summaries_train_df['tokens'].apply(apply_stemming)
summaries_train_df['lemmatized_tokens'] = summaries_train_df['tokens'].apply(apply_lemmatization)

prompts_train_df['tokens'] = prompts_train_df['preprocessed_prompt_text'].apply(word_tokenize)
prompts_train_df['stemmed_tokens'] = prompts_train_df['tokens'].apply(apply_stemming)
prompts_train_df['lemmatized_tokens'] = prompts_train_df['tokens'].apply(apply_lemmatization)

# PART OF SPEECH TAGGING
def apply_pos_tagging(tokens):
    return pos_tag(tokens)

summaries_train_df['pos_tags'] = summaries_train_df['tokens'].apply(apply_pos_tagging)
prompts_train_df['pos_tags'] = prompts_train_df['tokens'].apply(apply_pos_tagging)



# TF-IDF Vectorization
def apply_tfidf(tokens):
    tfidf_vectorizer = TfidfVectorizer()
    tfidf_matrix = tfidf_vectorizer.fit_transform([' '.join(tokens)])
    return tfidf_matrix

summaries_train_df['tfidf_matrix'] = summaries_train_df['lemmatized_tokens'].apply(apply_tfidf)
prompts_train_df['tfidf_matrix'] = prompts_train_df['lemmatized_tokens'].apply(apply_tfidf)

print("Preprocessed Summaries Train Data with TF-IDF:")
print(summaries_train_df[['text', 'preprocessed_text', 'tokens', 'stemmed_tokens', 'lemmatized_tokens', 'pos_tags', 'tfidf_matrix']].head())

print("\nPreprocessed Prompts Train Data with TF-IDF:")
print(prompts_train_df[['prompt_text', 'preprocessed_prompt_text', 'tokens', 'stemmed_tokens', 'lemmatized_tokens', 'pos_tags', 'tfidf_matrix']].head())

Preprocessed Summaries Train Data with TF-IDF:
                                    text                  preprocessed_text  \
0    0.205682506482641 0.380537638762288  0205682506482641 0380537638762288   
1   -0.548304076980462 0.506755353548534  0548304076980462 0506755353548534   
2      3.12892846350062 4.23122555224945    312892846350062 423122555224945   
3  -0.210613934166593 -0.471414826967448  0210613934166593 0471414826967448   
4      3.27289414977436 3.21975651022738    327289414977436 321975651022738   

                                 tokens                        stemmed_tokens  \
0  [0205682506482641, 0380537638762288]  [0205682506482641, 0380537638762288]   
1  [0548304076980462, 0506755353548534]  [0548304076980462, 0506755353548534]   
2    [312892846350062, 423122555224945]    [312892846350062, 423122555224945]   
3  [0210613934166593, 0471414826967448]  [0210613934166593, 0471414826967448]   
4    [327289414977436, 321975651022738]    [327289414977436, 321975651022

In [35]:

#PERFORM NAIVE BAYES MODEL

import pandas as pd
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk import pos_tag
from sklearn.feature_extraction.text import TfidfVectorizer
import re

from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import train_test_split



#DATASET
summaries_train_path = "C:\\Users\\nh013\\Desktop\\CommonLit - Evaluate Student Summaries\\summaries_train.csv"
summaries_test_path = "C:\\Users\\nh013\\Desktop\\CommonLit - Evaluate Student Summaries\\summaries_test.csv"
prompts_train_path = "C:\\Users\\nh013\\Desktop\\CommonLit - Evaluate Student Summaries\\prompts_train.csv"
prompts_test_path = "C:\\Users\\nh013\\Desktop\\CommonLit - Evaluate Student Summaries\\prompts_test.csv"

summaries_train_df = pd.read_csv(summaries_train_path)
summaries_test_df = pd.read_csv(summaries_test_path)
prompts_train_df = pd.read_csv(prompts_train_path)
prompts_test_df = pd.read_csv(prompts_test_path)




#'content' and 'wording' COLUMNS ARE CONVERTED TO STRING
summaries_train_df['content'] = summaries_train_df['content'].astype(str)
summaries_train_df['wording'] = summaries_train_df['wording'].astype(str)

#COMBINE TEXT DATA FROM SUMMARAISE AND PROMPTS DATAFRANE
summaries_train_df['text'] = summaries_train_df['content'] + " " + summaries_train_df['wording']
prompts_train_df['prompt_text'] = prompts_train_df['prompt_question'] + " " + prompts_train_df['prompt_title'] + " " + prompts_train_df['prompt_text']




# PREPROCESSING FUNCTION
def preprocess_text(text):
    
    
    # CONVERT TO LOWERCASE
    text = text.lower()
    
    #REMOVE URLS
    text = re.sub(r'http\S+', '', text)
    
    # PREMOVE SPECIAL CHERECTER AND PUNCTUATION
    text = re.sub(r'[^\w\s]', '', text)
    
    # TOKENIZATION
    tokens = word_tokenize(text)
    
    # REMOVE STOPWORDS
    stop_words = set(stopwords.words('english'))
    filtered_tokens = [word for word in tokens if word not in stop_words]
    
    return " ".join(filtered_tokens)

def apply_lemmatization(tokens):
    lemmatizer = WordNetLemmatizer()
    lemmatized_tokens = [lemmatizer.lemmatize(token) for token in tokens]
    return lemmatized_tokens

def apply_stemming(tokens):
    stemmer = PorterStemmer()
    stemmed_tokens = [stemmer.stem(token) for token in tokens]
    return stemmed_tokens




# PREPROCESSING FOR  summaries_train_df
summaries_train_df['preprocessed_text'] = summaries_train_df['text'].apply(preprocess_text)
summaries_train_df['preprocessed_content'] = summaries_train_df['content'].apply(preprocess_text)
summaries_train_df['preprocessed_wording'] = summaries_train_df['wording'].apply(preprocess_text)

#PREPROCESSING FOR  prompts_train_df
prompts_train_df['preprocessed_prompt_text'] = prompts_train_df['prompt_text'].apply(preprocess_text)
prompts_train_df['preprocessed_prompt_question'] = prompts_train_df['prompt_question'].apply(preprocess_text)
prompts_train_df['preprocessed_prompt_title'] = prompts_train_df['prompt_title'].apply(preprocess_text)


# TOKENIZATION, STEMMING, AND LEMMATIZATION
summaries_train_df['tokens'] = summaries_train_df['preprocessed_text'].apply(word_tokenize)
summaries_train_df['stemmed_tokens'] = summaries_train_df['tokens'].apply(apply_stemming)
summaries_train_df['lemmatized_tokens'] = summaries_train_df['tokens'].apply(apply_lemmatization)

prompts_train_df['tokens'] = prompts_train_df['preprocessed_prompt_text'].apply(word_tokenize)
prompts_train_df['stemmed_tokens'] = prompts_train_df['tokens'].apply(apply_stemming)
prompts_train_df['lemmatized_tokens'] = prompts_train_df['tokens'].apply(apply_lemmatization)



# PART OF SPEECH TAGGING
def apply_pos_tagging(tokens):
    return pos_tag(tokens)

summaries_train_df['pos_tags'] = summaries_train_df['tokens'].apply(apply_pos_tagging)
prompts_train_df['pos_tags'] = prompts_train_df['tokens'].apply(apply_pos_tagging)



# TF-IDF VECTORIZATION
def apply_tfidf(tokens):
    tfidf_vectorizer = TfidfVectorizer()
    tfidf_matrix = tfidf_vectorizer.fit_transform([' '.join(tokens)])
    return tfidf_matrix

summaries_train_df['tfidf_matrix'] = summaries_train_df['lemmatized_tokens'].apply(apply_tfidf)
prompts_train_df['tfidf_matrix'] = prompts_train_df['lemmatized_tokens'].apply(apply_tfidf)





# SPLIT DATA
X = summaries_train_df['content'] + " " + summaries_train_df['preprocessed_content'] + " " + summaries_train_df['preprocessed_wording']
y = summaries_train_df['wording']  # You need to create the 'label' column with class labels


# SPLIT DATA INTO TRAINING AND TESTING SET
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)



#LET'S CREATE TF-IDF VECTORE
tfidf_vectorizer = TfidfVectorizer()
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
X_test_tfidf = tfidf_vectorizer.transform(X_test)


# TRAIN MODEL
nb_model = MultinomialNB()
nb_model.fit(X_train_tfidf, y_train)

# PREDICTIONS
y_pred = nb_model.predict(X_test_tfidf)

# EVALUATION
accuracy = accuracy_score(y_test, y_pred)
classification_rep = classification_report(y_test, y_pred)

print("Accuracy:", accuracy)
print("Classification Report:\n", classification_rep)

#PREPROESS THE TEST DATA
summaries_test_df['preprocessed_text'] = summaries_test_df['text'].apply(preprocess_text)


# CREATE TF-IDF VECTORS FOR TEST DATA 
X_test_submission = summaries_test_df['preprocessed_text']
X_test_submission_tfidf = tfidf_vectorizer.transform(X_test_submission)

# PREDICTION ON TEST DATA 
y_test_submission_pred = nb_model.predict(X_test_submission_tfidf)

# SUBMISSSION DATA FRAME 
submission_df = pd.DataFrame({
    'student_id': summaries_test_df['student_id'],
    'content': summaries_test_df['text'],
    'wording': y_test_submission_pred
})

# SAVE SUBMISSION FILE
submission_file_path = "submission.csv"
submission_df.to_csv(submission_file_path, index=False)

print("Submission file saved:", submission_file_path)


Accuracy: 0.6189811584089323
Classification Report:
                      precision    recall  f1-score   support

-0.0015290909730967       0.00      0.00      0.00         1
-0.0016108985178951       0.00      0.00      0.00         1
-0.0025719695585915       1.00      1.00      1.00         2
-0.0052242590109083       1.00      1.00      1.00         3
-0.0091077501105194       0.00      0.00      0.00         3
-0.0283739794720577       0.00      0.00      0.00         1
-0.0319873399650709       0.00      0.00      0.00         1
-0.0425161740436652       1.00      1.00      1.00        10
-0.0444775230618836       0.00      0.00      0.00         1
  -0.04543859410258       1.00      1.00      1.00        34
-0.0463996651432763       0.00      0.00      0.00         1
-0.0493220852021911       0.00      0.00      0.00         1
-0.0522445052611057       0.00      0.00      0.00         2
-0.0529354456952042       0.00      0.00      0.00         1
-0.0568189367948152       0.00 

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [3]:
#perform RNNS model


import pandas as pd
import numpy as np
import nltk
import re
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk import pos_tag
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.preprocessing import LabelEncoder, OneHotEncoder



#DATASET
summaries_train_path = "C:\\Users\\nh013\\Desktop\\CommonLit - Evaluate Student Summaries\\summaries_train.csv"
summaries_test_path = "C:\\Users\\nh013\\Desktop\\CommonLit - Evaluate Student Summaries\\summaries_test.csv"
prompts_train_path = "C:\\Users\\nh013\\Desktop\\CommonLit - Evaluate Student Summaries\\prompts_train.csv"
prompts_test_path = "C:\\Users\\nh013\\Desktop\\CommonLit - Evaluate Student Summaries\\prompts_test.csv"

summaries_train_df = pd.read_csv(summaries_train_path)
summaries_test_df = pd.read_csv(summaries_test_path)
prompts_train_df = pd.read_csv(prompts_train_path)
prompts_test_df = pd.read_csv(prompts_test_path)




#'content' and 'wording' COLUMNS ARE CONVERTED TO STRING
summaries_train_df['content'] = summaries_train_df['content'].astype(str)
summaries_train_df['wording'] = summaries_train_df['wording'].astype(str)

#COMBINE TEXT DATA FROM SUMMARAISE AND PROMPTS DATAFRANE
summaries_train_df['text'] = summaries_train_df['content'] + " " + summaries_train_df['wording']
prompts_train_df['prompt_text'] = prompts_train_df['prompt_question'] + " " + prompts_train_df['prompt_title'] + " " + prompts_train_df['prompt_text']




# PREPROCESSING FUNCTION
def preprocess_text(text):
    
    
    
    # CONVERT TO LOWERCASE
    text = text.lower()
    
    #REMOVE URLS
    text = re.sub(r'http\S+', '', text)
    
    # PREMOVE SPECIAL CHERECTER AND PUNCTUATION
    text = re.sub(r'[^\w\s]', '', text)
    
    # TOKENIZATION
    tokens = word_tokenize(text)
    
    # REMOVE STOPWORDS
    stop_words = set(stopwords.words('english'))
    filtered_tokens = [word for word in tokens if word not in stop_words]
    
    return " ".join(filtered_tokens)

def apply_lemmatization(tokens):
    lemmatizer = WordNetLemmatizer()
    lemmatized_tokens = [lemmatizer.lemmatize(token) for token in tokens]
    return lemmatized_tokens

def apply_stemming(tokens):
    stemmer = PorterStemmer()
    stemmed_tokens = [stemmer.stem(token) for token in tokens]
    return stemmed_tokens




# PREPROCESSING FOR  summaries_train_df
summaries_train_df['preprocessed_text'] = summaries_train_df['text'].apply(preprocess_text)
summaries_train_df['preprocessed_content'] = summaries_train_df['content'].apply(preprocess_text)
summaries_train_df['preprocessed_wording'] = summaries_train_df['wording'].apply(preprocess_text)

# PREPROCESSING FOR prompts_train_df
prompts_train_df['preprocessed_prompt_text'] = prompts_train_df['prompt_text'].apply(preprocess_text)
prompts_train_df['preprocessed_prompt_question'] = prompts_train_df['prompt_question'].apply(preprocess_text)
prompts_train_df['preprocessed_prompt_title'] = prompts_train_df['prompt_title'].apply(preprocess_text)


# TOKENIZATION, STEMMING, AND LEMMATIZATION
summaries_train_df['tokens'] = summaries_train_df['preprocessed_text'].apply(word_tokenize)
summaries_train_df['stemmed_tokens'] = summaries_train_df['tokens'].apply(apply_stemming)
summaries_train_df['lemmatized_tokens'] = summaries_train_df['tokens'].apply(apply_lemmatization)

prompts_train_df['tokens'] = prompts_train_df['preprocessed_prompt_text'].apply(word_tokenize)
prompts_train_df['stemmed_tokens'] = prompts_train_df['tokens'].apply(apply_stemming)
prompts_train_df['lemmatized_tokens'] = prompts_train_df['tokens'].apply(apply_lemmatization)


# PART OF SPEECH TAGGING
def apply_pos_tagging(tokens):
    return pos_tag(tokens)

summaries_train_df['pos_tags'] = summaries_train_df['tokens'].apply(apply_pos_tagging)
prompts_train_df['pos_tags'] = prompts_train_df['tokens'].apply(apply_pos_tagging)



# TF-IDF VECTORIZATION
def apply_tfidf(tokens):
    tfidf_vectorizer = TfidfVectorizer()
    tfidf_matrix = tfidf_vectorizer.fit_transform([' '.join(tokens)])
    return tfidf_matrix

summaries_train_df['tfidf_matrix'] = summaries_train_df['lemmatized_tokens'].apply(apply_tfidf)
prompts_train_df['tfidf_matrix'] = prompts_train_df['lemmatized_tokens'].apply(apply_tfidf)






# CONVERT 'content' and 'wording' COLUMNS TO NUMERIC
summaries_train_df['content'] = pd.to_numeric(summaries_train_df['content'], errors='coerce')
summaries_train_df['wording'] = pd.to_numeric(summaries_train_df['wording'], errors='coerce')

# DROP NAN VALUES in 'content' and 'wording' columns
summaries_train_df = summaries_train_df.dropna(subset=['content', 'wording'])


# CONCATENATE THE  numeric 'content' and 'wording' COLUMNS
summaries_train_df['text'] = summaries_train_df['content'] + summaries_train_df['wording']

# SPLIT DATA 
X = summaries_train_df['text'].astype(str) + " " + summaries_train_df['preprocessed_content'].astype(str) + " " + summaries_train_df['preprocessed_wording'].astype(str)

y = summaries_train_df['wording'].values  # Convert target variable to NumPy array for regression


# TOKENIZE THE TEXT 
tokenizer = Tokenizer()
tokenizer.fit_on_texts(X)
X_sequences = tokenizer.texts_to_sequences(X)
X_padded = pad_sequences(X_sequences, padding='post')

# SPLIT DATA INTO TRAIN AND VALIDATION SET
X_train, X_val, y_train, y_val = train_test_split(X_padded, y, test_size=0.2, random_state=42)

# BUILD MODEL 
model = Sequential()
model.add(Embedding(input_dim=len(tokenizer.word_index) + 1, output_dim=100, input_length=X_padded.shape[1]))
model.add(LSTM(units=128, return_sequences=True))
model.add(LSTM(units=64))
model.add(Dense(units=1, activation='linear'))  # Linear activation for regression

model.compile(loss='mean_squared_error', optimizer='adam')

# TRAIN THE MODEL
model.fit(X_train, y_train, validation_data=(X_val, y_val), epochs=100, batch_size=32)

# EVALUATE 
loss = model.evaluate(X_val, y_val)
print("Validation Loss:", loss)

# PREDICTIONS 
predictions = model.predict(X_val)


# LET'S CREATE INTERECTION LOOPS
def preprocess_user_input(text):
    
    # PREPROCESS USER INPUT TEXT
    preprocessed_text = preprocess_text(text)
    preprocessed_tokens = word_tokenize(preprocessed_text)
    stemmed_tokens = apply_stemming(preprocessed_tokens)
    lemmatized_tokens = apply_lemmatization(preprocessed_tokens)
    return ' '.join(lemmatized_tokens)

def assess_summary_quality(user_input):
    
    
    #PREPROCESS USER INPUT
    preprocessed_input = preprocess_user_input(user_input)
    
    # TOKENIZE AND PAD THE INPUT
    input_sequences = tokenizer.texts_to_sequences([preprocessed_input])
    padded_input = pad_sequences(input_sequences, padding='post', maxlen=X_padded.shape[1])
    
    
    # MAKE PREDICTION
    predicted_quality = model.predict(padded_input)
    
    return predicted_quality[0][0]

# INTERECTION LOOP  EXECUTE
while True:
    user_input = input("Enter a summary (type 'exit' to quit): ")
    if user_input.lower() == 'exit':
        break
    
    predicted_quality = assess_summary_quality(user_input)
    print("Predicted Summary Quality:", predicted_quality)


Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78