In [1]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [18]:
import pandas as pd
import numpy as np
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.svm import SVR
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import Lasso
from sklearn.metrics import mean_squared_error
from scipy.sparse import hstack

In [3]:
# 1. Data Loading and Merging
prompts_train = pd.read_csv('/content/drive/MyDrive/UOK/Level4/Research/Dataset/prompts_train.csv')
summaries_train = pd.read_csv('/content/drive/MyDrive/UOK/Level4/Research/Dataset/summaries_train.csv')
merged_train = summaries_train.merge(prompts_train, on='prompt_id')
summaries_test = pd.read_csv('/content/drive/MyDrive/UOK/Level4/Research/Dataset/summaries_test.csv')

In [4]:
merged_train.head()

Unnamed: 0,student_id,prompt_id,text,content,wording,prompt_question,prompt_title,prompt_text
0,000e8c3c7ddb,814d6b,The third wave was an experimentto see how peo...,0.205683,0.380538,Summarize how the Third Wave developed over su...,The Third Wave,Background \r\nThe Third Wave experiment took ...
1,0070c9e7af47,814d6b,The Third Wave developed rapidly because the ...,3.272894,3.219757,Summarize how the Third Wave developed over su...,The Third Wave,Background \r\nThe Third Wave experiment took ...
2,0095993991fe,814d6b,The third wave only started as an experiment w...,0.205683,0.380538,Summarize how the Third Wave developed over su...,The Third Wave,Background \r\nThe Third Wave experiment took ...
3,00c20c6ddd23,814d6b,The experimen was orginally about how even whe...,0.567975,0.969062,Summarize how the Third Wave developed over su...,The Third Wave,Background \r\nThe Third Wave experiment took ...
4,00d40ad10dc9,814d6b,The third wave developed so quickly due to the...,-0.910596,-0.081769,Summarize how the Third Wave developed over su...,The Third Wave,Background \r\nThe Third Wave experiment took ...


In [5]:
# 2. Text Cleaning
stop_words = "english"

clean_text_simple = lambda text: re.sub(r'[^a-zA-Z\s]', '', str(text).lower())

merged_train['cleaned_text'] = merged_train['text'].apply(lambda x: clean_text_simple(x))
merged_train['cleaned_prompt'] = merged_train['prompt_text'].apply(lambda x: clean_text_simple(x))

In [6]:
merged_train.head()

Unnamed: 0,student_id,prompt_id,text,content,wording,prompt_question,prompt_title,prompt_text,cleaned_text,cleaned_prompt
0,000e8c3c7ddb,814d6b,The third wave was an experimentto see how peo...,0.205683,0.380538,Summarize how the Third Wave developed over su...,The Third Wave,Background \r\nThe Third Wave experiment took ...,the third wave was an experimentto see how peo...,background \r\nthe third wave experiment took ...
1,0070c9e7af47,814d6b,The Third Wave developed rapidly because the ...,3.272894,3.219757,Summarize how the Third Wave developed over su...,The Third Wave,Background \r\nThe Third Wave experiment took ...,the third wave developed rapidly because the ...,background \r\nthe third wave experiment took ...
2,0095993991fe,814d6b,The third wave only started as an experiment w...,0.205683,0.380538,Summarize how the Third Wave developed over su...,The Third Wave,Background \r\nThe Third Wave experiment took ...,the third wave only started as an experiment w...,background \r\nthe third wave experiment took ...
3,00c20c6ddd23,814d6b,The experimen was orginally about how even whe...,0.567975,0.969062,Summarize how the Third Wave developed over su...,The Third Wave,Background \r\nThe Third Wave experiment took ...,the experimen was orginally about how even whe...,background \r\nthe third wave experiment took ...
4,00d40ad10dc9,814d6b,The third wave developed so quickly due to the...,-0.910596,-0.081769,Summarize how the Third Wave developed over su...,The Third Wave,Background \r\nThe Third Wave experiment took ...,the third wave developed so quickly due to the...,background \r\nthe third wave experiment took ...


In [7]:
# 3. Feature Extraction
merged_train['summary_length'] = merged_train['cleaned_text'].apply(len)
merged_train['unique_words'] = merged_train['cleaned_text'].apply(lambda x: len(set(x.split())))
merged_train['word_overlap'] = merged_train.apply(lambda row: len(set(row['cleaned_text'].split()) & set(row['cleaned_prompt'].split())), axis=1)


In [8]:
merged_train.head()

Unnamed: 0,student_id,prompt_id,text,content,wording,prompt_question,prompt_title,prompt_text,cleaned_text,cleaned_prompt,summary_length,unique_words,word_overlap
0,000e8c3c7ddb,814d6b,The third wave was an experimentto see how peo...,0.205683,0.380538,Summarize how the Third Wave developed over su...,The Third Wave,Background \r\nThe Third Wave experiment took ...,the third wave was an experimentto see how peo...,background \r\nthe third wave experiment took ...,343,49,24
1,0070c9e7af47,814d6b,The Third Wave developed rapidly because the ...,3.272894,3.219757,Summarize how the Third Wave developed over su...,The Third Wave,Background \r\nThe Third Wave experiment took ...,the third wave developed rapidly because the ...,background \r\nthe third wave experiment took ...,1187,124,51
2,0095993991fe,814d6b,The third wave only started as an experiment w...,0.205683,0.380538,Summarize how the Third Wave developed over su...,The Third Wave,Background \r\nThe Third Wave experiment took ...,the third wave only started as an experiment w...,background \r\nthe third wave experiment took ...,336,46,31
3,00c20c6ddd23,814d6b,The experimen was orginally about how even whe...,0.567975,0.969062,Summarize how the Third Wave developed over su...,The Third Wave,Background \r\nThe Third Wave experiment took ...,the experimen was orginally about how even whe...,background \r\nthe third wave experiment took ...,441,53,39
4,00d40ad10dc9,814d6b,The third wave developed so quickly due to the...,-0.910596,-0.081769,Summarize how the Third Wave developed over su...,The Third Wave,Background \r\nThe Third Wave experiment took ...,the third wave developed so quickly due to the...,background \r\nthe third wave experiment took ...,143,24,16


In [9]:
# 4. Text Vectorization using TF-IDF
tfidf_vectorizer = TfidfVectorizer(max_features=1000, stop_words=stop_words)
tfidf_features_sparse = tfidf_vectorizer.fit_transform(merged_train['cleaned_text'])
X_sparse = hstack([tfidf_features_sparse, merged_train[['summary_length', 'unique_words', 'word_overlap']].values])


In [13]:
print(X_sparse)

  (0, 315)	0.21632207125700778
  (0, 350)	0.1749763236201714
  (0, 849)	0.24456610634820192
  (0, 659)	0.16751557625009758
  (0, 389)	0.1903924776244514
  (0, 331)	0.24250265845942182
  (0, 841)	0.22254129740608347
  (0, 757)	0.18454978311572715
  (0, 313)	0.1780791494199103
  (0, 863)	0.13854416940567246
  (0, 896)	0.14158123307602472
  (0, 925)	0.22089867181911674
  (0, 962)	0.18387309200117333
  (0, 352)	0.128287449833357
  (0, 472)	0.4596708355860556
  (0, 570)	0.3843848425628618
  (0, 613)	0.3280334019458479
  (0, 969)	0.13929785597405672
  (1, 588)	0.10798894738668921
  (1, 563)	0.11125400250837392
  (1, 974)	0.11811694025597858
  (1, 798)	0.13250883277427777
  (1, 287)	0.12159624208901276
  (1, 347)	0.11100249461205271
  (1, 444)	0.06960116837646241
  :	:
  (7156, 1002)	35.0
  (7157, 1000)	295.0
  (7157, 1001)	35.0
  (7157, 1002)	28.0
  (7158, 1000)	257.0
  (7158, 1001)	33.0
  (7158, 1002)	22.0
  (7159, 1000)	171.0
  (7159, 1001)	24.0
  (7159, 1002)	23.0
  (7160, 1000)	179.0
  (

In [20]:

# 5. Model Training
X_train, X_val, y_train, y_val = train_test_split(X_sparse, merged_train[['content', 'wording']], test_size=0.2, random_state=42)
lr_content = RandomForestRegressor()
lr_content.fit(X_train, y_train['content'])
lr_wording = RandomForestRegressor()
lr_wording.fit(X_train, y_train['wording'])

In [8]:

# 6. Predictions on summaries_test.csv
summaries_test = pd.merge(summaries_test, prompts_train, on='prompt_id', how='left')
summaries_test['cleaned_text'] = summaries_test['text'].apply(lambda x: clean_text_simple(x))
summaries_test['cleaned_prompt'] = summaries_test['prompt_text'].apply(lambda x: clean_text_simple(x))

if summaries_test.shape[0] > 0:
    summaries_test['summary_length'] = summaries_test['cleaned_text'].apply(len)
    summaries_test['unique_words'] = summaries_test['cleaned_text'].apply(lambda x: len(set(x.split())))
    summaries_test['word_overlap'] = summaries_test.apply(lambda row: len(set(row['cleaned_text'].split()) & set(row['cleaned_prompt'].split())), axis=1)

    tfidf_test_features_sparse = tfidf_vectorizer.transform(summaries_test['cleaned_text'])
    X_test_sparse = hstack([tfidf_test_features_sparse, summaries_test[['summary_length', 'unique_words', 'word_overlap']].values])
    content_pred_test = lr_content.predict(X_test_sparse)
    wording_pred_test = lr_wording.predict(X_test_sparse)
    submission_df = pd.DataFrame({
        'student_id': summaries_test['student_id'],
        'content': content_pred_test,
        'wording': wording_pred_test
    })
    submission_df['content'] = submission_df['content'].clip(0, 1)
    submission_df['wording'] = submission_df['wording'].clip(0, 1)
    submission_df.to_csv('submission.csv', index=False)
else:
    print("No records in summaries_test after merging with prompts_train.")