In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error

In [2]:
# Read the raw essay dataset from the CSV file in the working directory.
df = pd.read_csv(filepath_or_buffer="ielts_dataset_original.csv")

In [3]:
# Show the loaded frame as a quick sanity check of the columns and values.
df

Unnamed: 0,Task_Type,Question,Essay,Examiner_Commen,Task_Response,Coherence_Cohesion,Lexical_Resource,Range_Accuracy,Overall
0,1,The bar chart below describes some changes abo...,"Between 1995 and 2010, a study was conducted r...",,,,,,5.5
1,2,Rich countries often give money to poorer coun...,Poverty represents a worldwide crisis. It is t...,,,,,,6.5
2,1,The bar chart below describes some changes abo...,The left chart shows the population change hap...,,,,,,5.0
3,2,Rich countries often give money to poorer coun...,Human beings are facing many challenges nowada...,,,,,,5.5
4,1,The graph below shows the number of overseas v...,Information about the thousands of visits from...,,,,,,7.0
...,...,...,...,...,...,...,...,...,...
1430,2,Without capital punishment our lives are less ...,Serious crimes need capital punishment so that...,"You should rewrite it, giving more thought to ...",,,,,5.0
1431,2,Some people think that they can learn better b...,It is certainly said that learning is an ongoi...,Where are the paragraphs in this essay? You mu...,,,,,5.0
1432,2,Nowadays people like to change their day by da...,popular hobbies rather than their individual a...,I suggest that you read several essays that ha...,,,,,5.0
1433,2,Universities should allocate the same amount o...,"Yes, I do feel that universities should have a...","Your essay too short, the introduction is good...",,,,,5.0


In [4]:
# Per-column count of missing values, used to decide which columns are usable.
df.isna().sum()

Task_Type                0
Question                 0
Essay                    0
Examiner_Commen       1373
Task_Response         1435
Coherence_Cohesion    1435
Lexical_Resource      1435
Range_Accuracy        1435
Overall                  0
dtype: int64

In [5]:
# Keep only the columns used for modelling. They are selected by name rather
# than by position, so a change in the CSV column order raises a clear
# KeyError instead of silently picking the wrong columns. .copy() gives
# work_df its own data so later edits cannot touch df.
work_df = df[['Task_Type', 'Question', 'Essay', 'Overall']].copy()

In [6]:
# Show the reduced frame to confirm that only the intended columns remain.
work_df

Unnamed: 0,Task_Type,Question,Essay,Overall
0,1,The bar chart below describes some changes abo...,"Between 1995 and 2010, a study was conducted r...",5.5
1,2,Rich countries often give money to poorer coun...,Poverty represents a worldwide crisis. It is t...,6.5
2,1,The bar chart below describes some changes abo...,The left chart shows the population change hap...,5.0
3,2,Rich countries often give money to poorer coun...,Human beings are facing many challenges nowada...,5.5
4,1,The graph below shows the number of overseas v...,Information about the thousands of visits from...,7.0
...,...,...,...,...
1430,2,Without capital punishment our lives are less ...,Serious crimes need capital punishment so that...,5.0
1431,2,Some people think that they can learn better b...,It is certainly said that learning is an ongoi...,5.0
1432,2,Nowadays people like to change their day by da...,popular hobbies rather than their individual a...,5.0
1433,2,Universities should allocate the same amount o...,"Yes, I do feel that universities should have a...",5.0


In [7]:
# Inputs: the task type plus the two free-text columns.
x = work_df.loc[:, ['Task_Type', 'Question', 'Essay']]
# Target: kept as a one-column DataFrame (not a Series).
y = work_df.loc[:, ['Overall']]

In [8]:
# Hold out 30% of the rows for evaluation; the fixed seed makes the split
# reproducible across runs.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.3,
    random_state=42,
)

In [9]:
# TF-IDF features over the question and essay text joined by a single space.
# The vocabulary is capped at 2000 terms; tune max_features as needed.
vectorizer = TfidfVectorizer(max_features=2000)
# Fit the vocabulary on the training text only, then reuse it for the test
# text so no information from the test split leaks into the features.
x_train_tfidf = vectorizer.fit_transform(
    x_train['Question'].str.cat(x_train['Essay'], sep=' ')
)
x_test_tfidf = vectorizer.transform(
    x_test['Question'].str.cat(x_test['Essay'], sep=' ')
)

In [10]:
# Fit an ordinary least-squares regressor on the TF-IDF training features.
# fit() returns the estimator itself, so construction and fitting are chained.
model = LinearRegression().fit(x_train_tfidf, y_train)
# Echo the fitted estimator as the cell output.
model


LinearRegression()

In [11]:
# Predict scores for the held-out test features.
y_pred = model.predict(x_test_tfidf)

# Evaluate the model's performance on the test split.
mae = mean_absolute_error(y_test, y_pred)
# RMSE is computed as the explicit square root of the MSE. The `squared=False`
# keyword of mean_squared_error is deprecated in scikit-learn 1.4 and removed
# in 1.6, so relying on it breaks a fresh run on current releases.
rmse = np.sqrt(mean_squared_error(y_test, y_pred))

In [12]:
# Inspect the shape of the prediction array.
y_pred.shape

(431, 1)

In [13]:
# Inspect the shape of the test targets for comparison with the predictions.
y_test.shape

(431, 1)

In [17]:
# Start the comparison table from an independent copy of the test targets so
# that adding columns to it leaves y_test unchanged.
compare = y_test.copy(deep=True)

In [18]:
# Attach the model's predictions next to the true scores.
compare = compare.assign(Predictions=y_pred)

In [20]:
# Signed error per essay: true score minus predicted score.
compare['Overall - Predictions'] = compare['Overall'].sub(compare['Predictions'])
# Show the comparison table.
compare

Unnamed: 0,Overall,Predictions,Overall - Predictions
752,6.5,6.584271,-0.084271
857,7.5,6.909608,0.590392
629,7.0,7.755630,-0.755630
1411,6.0,6.140227,-0.140227
974,5.5,8.698177,-3.198177
...,...,...,...
1055,6.5,5.645055,0.854945
156,8.0,4.946964,3.053036
571,6.5,8.258908,-1.758908
594,7.0,5.906636,1.093364


In [21]:
# Report the mean absolute error between the test targets and the predictions.
print(mae)

0.8935732725543789


In [22]:
# Report the root mean squared error between the test targets and the predictions.
print(rmse)

1.2106901157952714
