<a href="https://colab.research.google.com/github/namita-ach/Emotion-Recognition-CNN/blob/main/ML/efficiency_prediction.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [4]:
import numpy as np
import pandas as pd
from sklearn import svm #used for PREDICTION
from sklearn.metrics import accuracy_score #accuracy of predicted vs true labels
from sklearn.preprocessing import StandardScaler #standardizes dataset
from sklearn.model_selection import train_test_split #name says it all

In [5]:
#reading our csv and assgning it to a variable
'''
For topic: 1- photosynthesis, 2- mitosis, 3- meiosis
For communication: 1- hands on activity, 2- lecture, 3- discussion, 4- video summaries
'''
data = pd.read_csv("dataset.csv")
data

Unnamed: 0,Topic,Mode of communication,Effectiveness
0,1,1,4
1,3,3,5
2,2,4,3
3,2,3,4
4,3,2,2
...,...,...,...
2494,1,4,2
2495,1,4,4
2496,2,1,3
2497,2,3,5


In [6]:
#to give us a brief overview of what we're working with (top 5 rows)
data.head()

Unnamed: 0,Topic,Mode of communication,Effectiveness
0,1,1,4
1,3,3,5
2,2,4,3
3,2,3,4
4,3,2,2


In [7]:
#gives us info about the dataset
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2499 entries, 0 to 2498
Data columns (total 3 columns):
 #   Column                 Non-Null Count  Dtype
---  ------                 --------------  -----
 0   Topic                  2499 non-null   int64
 1   Mode of communication  2499 non-null   int64
 2   Effectiveness          2499 non-null   int64
dtypes: int64(3)
memory usage: 58.7 KB


In [8]:
#tells us the number of rows and columns we're working with
data.shape

(2499, 3)

In [9]:
#we do this to find the number of null values, since their sum is 0, we have a relatively clean dataset
data.isnull().sum()

Topic                    0
Mode of communication    0
Effectiveness            0
dtype: int64

In [10]:
#our target column/variable will be effectiveness
data['Effectiveness'].value_counts()

3    518
2    514
5    495
1    489
4    483
Name: Effectiveness, dtype: int64

In [11]:
#effectiveness is our dependent variable
y = data['Effectiveness']
y

0       4
1       5
2       3
3       4
4       2
       ..
2494    2
2495    4
2496    3
2497    5
2498    4
Name: Effectiveness, Length: 2499, dtype: int64

In [12]:
#we're dropping status column because they aren't necessary data for prediction
x = data.drop(columns = ['Effectiveness'], axis = 1)
x

Unnamed: 0,Topic,Mode of communication
0,1,1
1,3,3
2,2,4
3,2,3
4,3,2
...,...,...
2494,1,4
2495,1,4
2496,2,1
2497,2,3


In [13]:
#we're doing this to split our data into training and testing
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size= 0.2)

In [14]:
#this tells us the rows and columns of independent variables, training and test
print(x.shape, x_train.shape, x_test.shape)

(2499, 2) (1999, 2) (500, 2)


In [15]:
#data standardization
stanscal = StandardScaler()

In [16]:
stanscal.fit(x_train)

In [17]:
#we're making our standardized values the x training and testing data
x_train = stanscal.transform(x_train)
x_test = stanscal.transform(x_test)

In [18]:
x_train

array([[-1.22569712,  0.45210204],
       [ 1.23554702, -1.33927949],
       [ 0.00492495,  1.34779281],
       ...,
       [-1.22569712,  0.45210204],
       [ 1.23554702,  0.45210204],
       [ 0.00492495, -1.33927949]])

In [19]:
x_test

array([[ 0.00492495,  1.34779281],
       [-1.22569712,  1.34779281],
       [ 0.00492495,  0.45210204],
       [ 1.23554702, -0.44358872],
       [-1.22569712, -1.33927949],
       [ 1.23554702,  1.34779281],
       [ 1.23554702, -1.33927949],
       [-1.22569712, -0.44358872],
       [ 0.00492495, -1.33927949],
       [-1.22569712, -0.44358872],
       [-1.22569712,  1.34779281],
       [-1.22569712, -0.44358872],
       [-1.22569712,  1.34779281],
       [-1.22569712, -0.44358872],
       [ 1.23554702,  0.45210204],
       [ 0.00492495, -0.44358872],
       [ 0.00492495,  0.45210204],
       [-1.22569712,  0.45210204],
       [ 1.23554702,  0.45210204],
       [-1.22569712,  1.34779281],
       [-1.22569712, -0.44358872],
       [ 1.23554702,  1.34779281],
       [ 0.00492495,  1.34779281],
       [-1.22569712,  0.45210204],
       [ 0.00492495, -1.33927949],
       [-1.22569712,  0.45210204],
       [ 1.23554702,  1.34779281],
       [ 1.23554702,  0.45210204],
       [ 0.00492495,

In [20]:
#we train our model using svm
model = svm.SVC(kernel = 'linear')

In [21]:
model.fit(x_train, y_train)

In [22]:
#model evaluation: tests the accuracy of our train wrt dependent y training variables
x_train_pred = model.predict(x_train)
train_data_accu = accuracy_score(y_train, x_train_pred)
train_data_accu

0.21160580290145073

In [23]:
#model evaluation: tests the accuracy of our test wrt dependent y test variables
x_test_pred = model.predict(x_test)
test_data_accu = accuracy_score(y_test, x_test_pred)
test_data_accu

0.214

In [24]:
input_data = (1, 3)
input_data_np = np.asarray(input_data) #changing input into a numpy array
input_data_re = input_data_np.reshape(1, -1)  #reshaping the array
s_data = stanscal.transform(input_data_re)  #standardizing the data (ie the reshaped input)
pred = model.predict(s_data) #prediction time
print(f"The effectiveness score if you use that method for this topic is {pred}")

The effectiveness score if you use that method for this topic is [2]


