**goal is to predict current offered course's faculty initial based on previous semesters data**

# Data Collection & Preprocessing

In [3]:
# import necessary libraries
import pandas as pd
import re
from tabula import read_pdf
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.ensemble.forest import RandomForestClassifier
from sklearn.metrics import accuracy_score

**1)first of all reading data from a file and deducting unnecessary columns from the dataset. 2)Then splitting a particular column into two new columns by checking some conditions & applying a litte bit of regex and then adding these two new columns with existing dataset. 3)now preprocessing data by removing certain rows from dataset based on certain conditions. 4)finally saving the dataset in csv format to reuse in near future**

In [111]:
# reading data from a csv file
df = pd.read_csv('fall19.csv')

# removing unnecessary columns
daytime = df.iloc[:, 3].tolist()
df = df.iloc[:, [0, 1, 2]]

# splitting one columns into two columns
days = []
times = []
for i in range(len(daytime)):
    split = re.split(' ', daytime[i])
    
    # checking conditin
    if(len(split[0])>2):
        days.append('TBA')
    else:
        days.append(split[0])
    
    # applying regex
    joining = ''.join(split[1:])
    times.append(re.sub(':|-| ', '', joining))

# adding new columns into existing dataset
df['Day'] = days
df['Time'] = times

# removing those rows which are containing irrelevant data 
Faculty = df['Faculty'] == 'TBA'
Day = df['Day'] == 'TBA'
Time = df['Time'] == 'TBA'
df = df[~(Faculty | Day | Time)]

# saving dataset into csv format
df.to_csv('193new.csv', index = False)

In [112]:
# reading data from a pdf file and remaining operations are almost same as before
df = read_pdf('192summer19.pdf', multiple_tables=True, pages="all")

dfs = []
for i in range(len(df)):
    if i == 0:
        temp = df[i]
        dfs.append(temp)
    else:
        temp = df[i]
        temp = temp.iloc[1:, :]
        dfs.append(temp)
        
df = pd.concat(dfs)

time_col = df.iloc[:, 4].tolist()
df = df.iloc[:, [1, 2, 3, 5, 6]]

days = []
times = []
for i in range(len(time_col)):
    if(i == 0):
        days.append('Day')
        times.append('Time')
    else:  
        split = re.split(' ', time_col[i])
        if(len(split[0])>2):
            days.append('TBA')
        else:
            days.append(split[0])

        joining = ''.join(split[1:])
        times.append(re.sub(':|-| |\r', '', joining))
        
df['Day'] = days
df['Time'] = times

df = df.iloc[:, [0, 1, 2, 5, 6]]

Faculty = df[2] == 'TBA'
Day = df['Day'] == 'TBA'
Time = df['Time'] == 'TBA'
df = df[~(Faculty | Day | Time)]

df.to_csv('192new.csv', header = False, index = False)

In [113]:
# reading another pdf file and storing necessary columns and data in a csv file
df = read_pdf('192summer19.pdf', multiple_tables=True, pages="all")

dfs = []
for i in range(len(df)):
    if i == 0:
        temp = df[i]
        dfs.append(temp)
    else:
        temp = df[i]
        temp = temp.iloc[1:, :]
        dfs.append(temp)
        
df = pd.concat(dfs)

time_col = df.iloc[:, 4].tolist()
df = df.iloc[:, [1, 2, 3, 5, 6]]

days = []
times = []
for i in range(len(time_col)):
    if(i == 0):
        days.append('Day')
        times.append('Time')
    else:  
        split = re.split(' ', time_col[i])
        if(len(split[0])>2):
            days.append('TBA')
        else:
            days.append(split[0])

        joining = ''.join(split[1:])
        times.append(re.sub(':|-| |\r', '', joining))
        
df['Day'] = days
df['Time'] = times

df = df.iloc[:, [0, 1, 2, 5, 6]]

Faculty = df[2] == 'TBA'
Day = df['Day'] == 'TBA'
Time = df['Time'] == 'TBA'
df = df[~(Faculty | Day | Time)]

df.to_csv('183new.csv', header = False, index = False)

In [119]:
# reading another pdf file and storing necessary columns and data in a csv file
df = read_pdf('182summer18.pdf', multiple_tables=True, pages="all")

dfs = []
for i in range(len(df)):
    if i == 0:
        temp = df[i]
        dfs.append(temp)
    else:
        temp = df[i]
        temp = temp.iloc[1:, :]
        dfs.append(temp)
        
df = pd.concat(dfs)

df = df.iloc[2:, 1:]
df = df.iloc[:, :4]

time_col = df.iloc[:, 3].tolist()
df = df.iloc[:, [0, 1, 2]]

days = []
times = []
for i in range(len(time_col)):
    if(i == 0):
        days.append('Day')
        times.append('Time')
    else:  
        split = re.split(' ', str(time_col[i]))
        if(len(split[0])>2):
            days.append('TBA')
        else:
            days.append(split[0])

        joining = ''.join(split[1:])
        times.append(re.sub(':|-| |\r', '', joining))
        
df['Day'] = days
df['Time'] = times

Faculty = df[3] == 'TBA | nan | NAN | NaN'
Day = df['Day'] == 'TBA | nan | NAN | NaN'
Time = df['Time'] == 'TBA | nan | NAN | NaN'
df = df[~(Faculty | Day | Time)]

df1 = df.iloc[1:, :]

x = list(df1['Time'])
time = []
for i in range(len(x)):
    if(len(x[i])<12 or len(x[i])>12):
        time.append('TBA')
    else:
        time.append(x[i])
        
df1 = df1.iloc[:, :-1]
df1['Time'] = time
df1.head()

df1.rename(columns={1: 'Course', 2: 'Section', 3 : 'Faculty'}, inplace=True)

Time = df1['Time'] == 'TBA'
df1 = df1[~(Time)]

df1.to_csv('182new.csv', index = False)

# Prediction

In [4]:
# loading datasets
df182 = pd.read_csv('182new.csv')
df183 = pd.read_csv('183new.csv')
df192 = pd.read_csv('192new.csv')
df193 = pd.read_csv('193new.csv')

# splitting data into groups based on the columns that contain course initial
df182groupby = df182.groupby('Course')
df183groupby = df183.groupby('Course')
df192groupby = df192.groupby('Course')
df193groupby = df193.groupby('Course')

# constructing DataFrames from group with provided course name.
course_name = 'CSE115'
df182getgroup = df182groupby.get_group(course_name)
df183getgroup = df183groupby.get_group(course_name)
df192getgroup = df192groupby.get_group(course_name)
df193getgroup = df193groupby.get_group(course_name)

# adding constructed dataframes together and splitting it into independent & dependent portions
df = pd.concat([df182getgroup, df183getgroup, df192getgroup, df193getgroup])
X = df.iloc[:, [1, 3, 4]]
y = df.iloc[:, 2]

# converting caterocial data into numerical data
label_encoder = LabelEncoder()
X['Day'] = label_encoder.fit_transform(X['Day'])
X['Time'] = label_encoder.fit_transform(X['Time'])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


In [5]:
# converting one column's numerical data into dummy variables and removing first column of the result to avoid dummy variable trap
one_hot_encoder = OneHotEncoder(categorical_features = [-1])
X = one_hot_encoder.fit_transform(X).toarray()
X = X[:, 1:]

In [6]:
# # converting another column's numerical data into dummy variables and removing first column of the result to avoid dummy variable trap
one_hot_encoder = OneHotEncoder(categorical_features = [-1])
X = one_hot_encoder.fit_transform(X).toarray()
X = X[:, 1:]

In [7]:
# converting another column's numerical data into dummy variables and removing first column of the result to avoid dummy variable trap
one_hot_encoder = OneHotEncoder(categorical_features = [-1])
X = one_hot_encoder.fit_transform(X).toarray()
X = X[:, 1:]

In [9]:
# splitting dataset for traing and testing
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = .2, random_state = 64)

# applyging RandomForestClassifier
classifier = RandomForestClassifier(n_estimators = 10).fit(X_train, y_train)

# predicting result based on random forest classification
y_pred = classifier.predict(X_test)

# calculating accuracy score of prediction
print(f'accuracy_score : {accuracy_score(y_test, y_pred)}')

# showing original vs predicted result
pd.DataFrame(list(zip(y_test, y_pred)), columns = ['Original', 'Predicted'])

accuracy_score : 0.75


Unnamed: 0,Original,Predicted
0,RjP,RjP
1,AKR,AKR
2,Rsl,Rsl
3,RjP,NvA
4,Srb,Srb
5,Srb,Srb
6,Mfs,Mfs
7,Mfs,Mfs
8,HSM,HSM
9,IHa,SAm
