# Import Data

In [None]:
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from tensorflow.keras.utils import to_categorical

In [None]:
# Relative path to the cleaned movie dataset (resolved against the working
# directory the notebook is run from).
csv_file = "clean_movies.csv"

# Load the CSV into a DataFrame; `head()` is left as the cell's last
# expression so the notebook renders a preview of the first rows.
movies_df = pd.read_csv(csv_file)
movies_df.head()

In [None]:
# Target: the `performance` column.
y = movies_df['performance']

# Features: every column except the target itself, `title` and
# `ratingInteger`.
excluded_columns = ['title', 'performance', 'ratingInteger']
X = movies_df.drop(columns=excluded_columns)

# Scale and Transform Data

In [None]:
# Hold out a test split; the fixed seed makes the partition reproducible.
split_parts = train_test_split(X, y, random_state=2)
X_train, X_test, y_train, y_test = split_parts

In [None]:
# Fit the min-max scaler on the training features only, so no information
# from the test split enters the scaling parameters.
X_scaler = MinMaxScaler()
X_scaler = X_scaler.fit(X_train)

In [None]:
# Rescale both splits with the scaler that was fitted on the training split.
X_train_scaled, X_test_scaled = (
    X_scaler.transform(features) for features in (X_train, X_test)
)

In [None]:
# Encode the class labels as integers.  The encoder learns its classes from
# the training labels only and is then applied to both splits.
label_encoder = LabelEncoder().fit(y_train)
encoded_y_train = label_encoder.transform(y_train)
encoded_y_test = label_encoder.transform(y_test)

# One-hot targets are not consumed by the KNN cells; kept for reference only.
# y_train_categorical = to_categorical(encoded_y_train)
# y_test_categorical = to_categorical(encoded_y_test)
print(encoded_y_train)

# K Nearest Neighbors

In [None]:
# Sweep over odd neighbour counts and record the accuracy on each split.
# `train_scores`, `test_scores` and `ktesting` are read again by the
# plotting cell, so their names must stay as they are.
# NOTE(review): k is being compared on the test split, so the test accuracy
# of whichever k is picked is an optimistic estimate -- consider a validation
# split or cross-validation for the selection.
train_scores = []
test_scores = []
ktesting = 40  # exclusive upper bound for k
for k in range(1, ktesting, 2):
    knn = KNeighborsClassifier(n_neighbors=k)
    knn.fit(X_train_scaled, encoded_y_train)
    train_score = knn.score(X_train_scaled, encoded_y_train)
    test_score = knn.score(X_test_scaled, encoded_y_test)
    train_scores.append(train_score)
    test_scores.append(test_score)
    # Both scores use the same `.3f` spec; the previous `: .3f` (space sign
    # flag) printed a stray leading space before the train score.
    print(f"k: {k}, Train/TestScore: {train_score:.3f}/{test_score:.3f}")

In [None]:
import os

# k values evaluated by the sweep cell; must mirror that cell's range.
k_values = range(1, ktesting, 2)

# Label each curve so the legend distinguishes train from test accuracy.
plt.plot(k_values, train_scores, marker='o', label='train')
plt.plot(k_values, test_scores, marker='x', label='test')
plt.xlabel('k neighbors')
# The axis carries both curves, so it is labelled generically.
plt.ylabel('accuracy score')
plt.legend()

# savefig does not create missing directories; make sure Images/ exists.
os.makedirs("Images", exist_ok=True)
plt.savefig("Images/KNN")
plt.show()

In [None]:
# Final model.  The neighbour count lives in a single variable so the printed
# labels can never drift from the model that was actually fitted.
# (Presumably 19 was picked from the k sweep above -- confirm.)
best_k = 19
knn = KNeighborsClassifier(n_neighbors=best_k)
knn.fit(X_train_scaled, encoded_y_train)

# Accuracy of the fitted model on each split.
train_accuracy = knn.score(X_train_scaled, encoded_y_train)
test_accuracy = knn.score(X_test_scaled, encoded_y_test)
print(f'k={best_k} Train Accuracy: {train_accuracy:.3f}')
print(f'k={best_k} Test Accuracy: {test_accuracy:.3f}')

In [None]:
# Predict the encoded class for every test row and show the predictions next
# to the true encoded labels.
predictions = knn.predict(X_test_scaled)
comparison = pd.DataFrame({"Prediction": predictions, "Actual": encoded_y_test})
# Last expression of the cell, so the notebook renders the table.
comparison.reset_index(drop=True)