<a href="https://colab.research.google.com/github/namita-ach/Hallothon--Haccio/blob/main/efficiency_prediction.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [54]:
import numpy as np
import pandas as pd
from sklearn import svm #used for PREDICTION
from sklearn.metrics import accuracy_score #accuracy of predicted vs true labels
from sklearn.preprocessing import StandardScaler #standardizes dataset
from sklearn.model_selection import train_test_split #name says it all

In [55]:
#reading our csv and assgning it to a variable
'''
For topic: 1- photosynthesis, 2- mitosis, 3- meiosis
For communication: 1- hands on activity, 2- lecture, 3- discussion, 4- video summaries
'''
data = pd.read_csv("dataset.csv")
data

Unnamed: 0,Topic,Mode of communication,Effectiveness
0,2,4,4
1,1,4,5
2,2,4,3
3,3,3,4
4,1,4,2
...,...,...,...
594,2,3,5
595,2,1,4
596,2,2,4
597,3,4,3


In [56]:
#to give us a brief overview of what we're working with (top 5 rows)
data.head()

Unnamed: 0,Topic,Mode of communication,Effectiveness
0,2,4,4
1,1,4,5
2,2,4,3
3,3,3,4
4,1,4,2


In [57]:
#gives us info about the dataset
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 599 entries, 0 to 598
Data columns (total 3 columns):
 #   Column                 Non-Null Count  Dtype
---  ------                 --------------  -----
 0   Topic                  599 non-null    int64
 1   Mode of communication  599 non-null    int64
 2   Effectiveness          599 non-null    int64
dtypes: int64(3)
memory usage: 14.2 KB


In [58]:
#tells us the number of rows and columns we're working with
data.shape

(599, 3)

In [59]:
#we do this to find the number of null values, since their sum is 0, we have a relatively clean dataset
data.isnull().sum()

Topic                    0
Mode of communication    0
Effectiveness            0
dtype: int64

In [60]:
#our target column/variable will be effectiveness
data['Effectiveness'].value_counts()

3    130
1    121
2    120
4    116
5    112
Name: Effectiveness, dtype: int64

In [61]:
#effectiveness is our dependent variable
y = data['Effectiveness']
y

0      4
1      5
2      3
3      4
4      2
      ..
594    5
595    4
596    4
597    3
598    2
Name: Effectiveness, Length: 599, dtype: int64

In [62]:
#we're dropping status column because they aren't necessary data for prediction
x = data.drop(columns = ['Effectiveness'], axis = 1)
x

Unnamed: 0,Topic,Mode of communication
0,2,4
1,1,4
2,2,4
3,3,3
4,1,4
...,...,...
594,2,3
595,2,1
596,2,2
597,3,4


In [63]:
#we're doing this to split our data into training and testing
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size= 0.2)

In [64]:
#this tells us the rows and columns of independent variables, training and test
print(x.shape, x_train.shape, x_test.shape)

(599, 2) (479, 2) (120, 2)


In [65]:
#data standardization
stanscal = StandardScaler()

In [68]:
stanscal.fit(x_train)

In [69]:
#we're making our standardized values the x training and testing data
x_train = stanscal.transform(x_train)
x_test = stanscal.transform(x_test)

In [70]:
x_train

array([[ 1.19913667,  0.46386493],
       [-1.32010221,  1.35619948],
       [-0.06048277, -1.32080417],
       [ 1.19913667,  0.46386493],
       [-1.32010221,  0.46386493],
       [-0.06048277, -0.42846962],
       [-1.32010221,  0.46386493],
       [ 1.19913667,  1.35619948],
       [-1.32010221, -0.42846962],
       [ 1.19913667, -0.42846962],
       [-0.06048277, -0.42846962],
       [-1.32010221, -0.42846962],
       [-0.06048277, -1.32080417],
       [-0.06048277, -0.42846962],
       [-0.06048277,  1.35619948],
       [-0.06048277,  1.35619948],
       [ 1.19913667,  0.46386493],
       [-1.32010221, -1.32080417],
       [ 1.19913667,  0.46386493],
       [-0.06048277,  0.46386493],
       [-1.32010221,  1.35619948],
       [-1.32010221, -1.32080417],
       [ 1.19913667,  1.35619948],
       [-0.06048277,  1.35619948],
       [-1.32010221,  0.46386493],
       [-0.06048277, -0.42846962],
       [-0.06048277,  1.35619948],
       [-1.32010221,  0.46386493],
       [-1.32010221,

In [71]:
x_test

array([[-0.06048277, -1.32080417],
       [-1.32010221,  1.35619948],
       [-0.06048277,  1.35619948],
       [-1.32010221, -1.32080417],
       [-0.06048277, -0.42846962],
       [ 1.19913667, -1.32080417],
       [-0.06048277, -0.42846962],
       [-1.32010221, -0.42846962],
       [ 1.19913667,  1.35619948],
       [-1.32010221, -1.32080417],
       [ 1.19913667,  0.46386493],
       [-0.06048277, -0.42846962],
       [-1.32010221, -1.32080417],
       [-0.06048277, -1.32080417],
       [ 1.19913667, -1.32080417],
       [-0.06048277,  0.46386493],
       [-1.32010221,  0.46386493],
       [-0.06048277,  1.35619948],
       [-0.06048277, -0.42846962],
       [ 1.19913667,  0.46386493],
       [-1.32010221,  1.35619948],
       [-1.32010221, -1.32080417],
       [-1.32010221,  0.46386493],
       [ 1.19913667,  1.35619948],
       [ 1.19913667, -1.32080417],
       [-0.06048277, -0.42846962],
       [ 1.19913667, -0.42846962],
       [-0.06048277, -1.32080417],
       [-1.32010221,

In [72]:
#we train our model using svm
model = svm.SVC(kernel = 'linear')

In [73]:
model.fit(x_train, y_train)

In [74]:
#model evaluation: tests the accuracy of our train wrt dependent y training variables
x_train_pred = model.predict(x_train)
train_data_accu = accuracy_score(y_train, x_train_pred)
train_data_accu

0.23173277661795408

In [75]:
#model evaluation: tests the accuracy of our test wrt dependent y test variables
x_test_pred = model.predict(x_test)
test_data_accu = accuracy_score(y_test, x_test_pred)
test_data_accu

0.19166666666666668

In [76]:
input_data = (1, 3)
input_data_np = np.asarray(input_data) #changing input into a numpy array
input_data_re = input_data_np.reshape(1, -1)  #reshaping the array
s_data = stanscal.transform(input_data_re)  #standardizing the data (ie the reshaped input)
pred = model.predict(s_data) #prediction time
print(f"The effectiveness score if you use that method for this topic is {pred}")

The effectiveness score if you use that method for this topic is [2]


