In [29]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Input, Concatenate
from tensorflow.keras.models import Model
from sklearn.metrics import mean_absolute_error, mean_squared_error


In [2]:
# Read the raw essay dataset; the relative path is resolved against the
# kernel's current working directory.
df = pd.read_csv(filepath_or_buffer="ielts_dataset_original.csv")

In [3]:
# Keep only the columns used downstream: positions 0-2 and 8 of the raw frame
# (shown in the preview below as Task_Type, Question, Essay and Overall).
# .copy() materialises an independent frame, so adding columns to work_df later
# does not trigger pandas' SettingWithCopyWarning.
work_df = df.iloc[:, [0, 1, 2, 8]].copy()

In [4]:
# Preview the working frame (a bare last expression gets the rich notebook display).
work_df

Unnamed: 0,Task_Type,Question,Essay,Overall
0,1,The bar chart below describes some changes abo...,"Between 1995 and 2010, a study was conducted r...",5.5
1,2,Rich countries often give money to poorer coun...,Poverty represents a worldwide crisis. It is t...,6.5
2,1,The bar chart below describes some changes abo...,The left chart shows the population change hap...,5.0
3,2,Rich countries often give money to poorer coun...,Human beings are facing many challenges nowada...,5.5
4,1,The graph below shows the number of overseas v...,Information about the thousands of visits from...,7.0
...,...,...,...,...
1430,2,Without capital punishment our lives are less ...,Serious crimes need capital punishment so that...,5.0
1431,2,Some people think that they can learn better b...,It is certainly said that learning is an ongoi...,5.0
1432,2,Nowadays people like to change their day by da...,popular hobbies rather than their individual a...,5.0
1433,2,Universities should allocate the same amount o...,"Yes, I do feel that universities should have a...",5.0


In [5]:
# Join the two text columns at positions 1 and 2 (Question and Essay in the
# preview above) into one string per row, separated by a '##' marker.
# assign() returns a new DataFrame instead of writing into what pandas may
# treat as a slice of df, which avoids the SettingWithCopyWarning.
work_df = work_df.assign(
    QnA=work_df.iloc[:, 1] + '##' + work_df.iloc[:, 2]
)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  work_df['QnA']= work_df.iloc[:,1] + '##' + work_df.iloc[:, 2]


In [6]:
# Turn each combined question/essay string into a padded sequence of word ids.
qna_texts = work_df['QnA']

tokenizer = Tokenizer()
tokenizer.fit_on_texts(qna_texts)  # learn the word -> integer index vocabulary

X_sequences = tokenizer.texts_to_sequences(qna_texts)
# With no maxlen given, pad_sequences pads every row to the longest sequence.
X_padded = pad_sequences(X_sequences)

In [9]:
# Second model input: the Task_Type column as an (n_rows, 1) column array.
X_task_type = work_df['Task_Type'].to_numpy().reshape(-1, 1)

# Regression target: the Overall column.
y = work_df.loc[:, 'Overall']

In [10]:
# Hold out 30% of the rows for validation. All three inputs are split with the
# same shuffled indices, and the fixed random_state makes the split repeatable.
(
    X_train_text, X_val_text,
    X_train_task_type, X_val_task_type,
    y_train, y_val,
) = train_test_split(
    X_padded,
    X_task_type,
    y,
    test_size=0.3,
    random_state=42,
)


In [14]:
# Two-input regression network: an LSTM encodes the tokenised text, and the
# task type is appended to that encoding before the final linear output unit.
sequence_length = X_padded.shape[1]
vocab_size = len(tokenizer.word_index) + 1  # +1: index 0 is reserved for padding

text_input = Input(shape=(sequence_length,))
task_type_input = Input(shape=(1,))

# input_length is intentionally not passed: the sequence length is already
# fixed by text_input, and the argument is deprecated in Keras 3.
embedding_layer = Embedding(input_dim=vocab_size, output_dim=50)(text_input)
lstm_layer = LSTM(128)(embedding_layer)
concatenated = Concatenate()([lstm_layer, task_type_input])
output = Dense(1, activation='linear')(concatenated)


In [15]:
# Assemble the functional model from its two inputs and configure it as a
# regression problem (mean-squared-error loss, Adam optimizer).
model = Model(
    inputs=[text_input, task_type_input],
    outputs=output,
)
model.compile(optimizer='adam', loss='mean_squared_error')

In [17]:
# Train for 10 epochs in mini-batches of 32, scoring the held-out split after
# every epoch.
model.fit(
    [X_train_text, X_train_task_type],
    y_train,
    batch_size=32,
    epochs=10,
    validation_data=([X_val_text, X_val_task_type], y_val),
)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x1c6662b84c0>

In [18]:
# Loss on the validation split (mean squared error, as set in model.compile).
val_loss = model.evaluate(x=[X_val_text, X_val_task_type], y=y_val)



In [31]:
# Predict a score for every validation row; y_pred has shape (n_rows, 1)
# because the model ends in a single-unit Dense layer.
y_pred = model.predict([X_val_text, X_val_task_type])

# Print actual vs. predicted side by side. The loop reads y_pred from this
# cell, so it does not depend on a name left over from earlier kernel state.
for actual, predicted in zip(y_val, y_pred[:, 0]):
    print(f"Actual Score: {actual}, Predicted Score: {predicted}")

Actual Score: 6.5, Predicted Score: 6.314497470855713
Actual Score: 7.5, Predicted Score: 6.731588840484619
Actual Score: 7.0, Predicted Score: 5.950225830078125
Actual Score: 6.0, Predicted Score: 7.293788909912109
Actual Score: 5.5, Predicted Score: 7.3820414543151855
Actual Score: 6.5, Predicted Score: 8.403034210205078
Actual Score: 6.5, Predicted Score: 6.721617221832275
Actual Score: 8.0, Predicted Score: 7.422724723815918
Actual Score: 6.5, Predicted Score: 5.648571014404297
Actual Score: 6.5, Predicted Score: 6.6747918128967285
Actual Score: 6.5, Predicted Score: 6.004540920257568
Actual Score: 5.5, Predicted Score: 7.17612361907959
Actual Score: 5.5, Predicted Score: 6.4287590980529785
Actual Score: 7.0, Predicted Score: 6.825746536254883
Actual Score: 7.0, Predicted Score: 8.077839851379395
Actual Score: 8.0, Predicted Score: 6.4710869789123535
Actual Score: 5.0, Predicted Score: 6.927803993225098
Actual Score: 7.5, Predicted Score: 6.901040077209473
Actual Score: 7.0, Predic

In [32]:
# Error metrics on the validation split, in the same units as the scores.
mae = mean_absolute_error(y_val, y_pred)
# RMSE is computed as sqrt(MSE) rather than via squared=False, which was
# deprecated in scikit-learn 1.4 and removed in 1.6.
rmse = np.sqrt(mean_squared_error(y_val, y_pred))

In [33]:
# Mean absolute error on the validation split.
print(mae)

0.8236106553929587


In [34]:
# Root mean squared error on the validation split.
print(rmse)

1.0723565156121684
