# Logistic Regression
Logistic Regression is a statistical method for predicting binary outcomes from data.

Examples of this are "yes" vs "no" or "young" vs "old".

These categories are encoded as 0 or 1, and the model estimates the probability that an observation belongs to class 1.

Source: Logistic Regression

In [None]:
%matplotlib inline
import matplotlib.pyplot as plt
import pandas as pd

In [None]:
# Path of the movie data set to load
csv_file = "clean_movies.csv"

# Store the .csv contents in a DataFrame and preview the first rows
movies_df = pd.read_csv(filepath_or_buffer=csv_file)
movies_df.head(n=5)

In [None]:
# Remove the 'title' and 'ratingInteger' columns from movies_df.
# DataFrame.drop returns a new frame instead of modifying in place, so the
# result has to be assigned back; previously it was discarded and movies_df
# stayed unchanged.
# 'title' is removed since it is a string.
# NOTE(review): the reason for removing 'ratingInteger' is not stated - confirm.
# errors="ignore" keeps this cell safe to re-run once the columns are gone.
movies_df = movies_df.drop(columns=['title', 'ratingInteger'], errors="ignore")

# Echo the trimmed frame so the notebook still displays it
movies_df

In [None]:
# Columns carried forward for modelling. "performance" is included here and is
# split off as the target when X and y are built.
feature_columns = [
    "performance",
    "year",
    "lifetime_gross",
    "ratingCount",
    "duration",
    "nrOfWins",
    "nrOfNominations",
    "nrOfPhotos",
    "nrOfNewsArticles",
    "nrOfUserReviews",
    "Action",
    "Adult",
    "Adventure",
    "Animation",
    "Biography",
    "Comedy",
    "Crime",
    "Documentary",
    "Drama",
    "Family",
    "Fantasy",
    "Horror",
    "Music",
    "Musical",
    "Mystery",
    "News",
    "RealityTV",
    "Romance",
    "SciFi",
    "Short",
    "Sport",
    "TalkShow",
    "Thriller",
    "War",
    "Western",
]
selected_features = movies_df.loc[:, feature_columns]

In [None]:
# Preview the first five rows of the selected columns
selected_features.head(n=5)

# Create a Train Test Split

In [None]:
# Separate the target column from the remaining feature columns
target_column = "performance"
X = selected_features.drop(columns=target_column)
y = selected_features[target_column]

# Show the dimensions of the feature matrix and the target vector
print(X.shape, y.shape)

In [None]:
from sklearn.model_selection import train_test_split

# Split features and target into train/test portions. The split is stratified
# on y and uses a fixed random_state so it is repeatable between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    stratify=y,
    random_state=1,
)

In [None]:
# Preview the first five rows of the training features
X_train.head(n=5)

# Pre-Processing

In [None]:
from sklearn.preprocessing import (
    LabelEncoder,
    MinMaxScaler,
)

# Fit the feature scaler on the training split only; the same fitted scaler
# is applied to both splits afterwards.
X_scaler = MinMaxScaler()
X_scaler = X_scaler.fit(X_train)

In [None]:
# Apply the fitted scaler to the training and test features
X_train_scaled, X_test_scaled = (
    X_scaler.transform(split) for split in (X_train, X_test)
)

In [None]:
# Fit the label encoder on the training targets, then encode both splits
# with that same fitted encoder
label_encoder = LabelEncoder().fit(y_train)
encoded_y_train, encoded_y_test = (
    label_encoder.transform(labels) for labels in (y_train, y_test)
)

# Show the encoded training targets
print(encoded_y_train)

# Train The Model

In [None]:
from sklearn.linear_model import LogisticRegression

# Build an untrained logistic-regression classifier with default settings
classifier = LogisticRegression()

# Echo the estimator so the notebook displays it
classifier

In [None]:
# Train the classifier on the scaled training features and encoded targets
classifier.fit(X=X_train_scaled, y=encoded_y_train)

In [None]:
# Report the classifier's score on the scaled training split
train_score = classifier.score(X_train_scaled, encoded_y_train)
print(f"Training Data Score: {train_score}")

# Report the classifier's score on the scaled test split
test_score = classifier.score(X_test_scaled, encoded_y_test)
print(f"Testing Data Score: {test_score}")

In [None]:
# Predict on the *scaled* test features. The classifier was fitted on
# X_train_scaled, so passing the raw X_test (as this cell did before) feeds it
# values on a different scale from the one it was trained on.
encoded_predictions = classifier.predict(X_test_scaled)

# The classifier was trained on label-encoded targets, so its output is in the
# encoded space. Map it back to the original labels so it is directly
# comparable with y_test.
predictions = label_encoder.inverse_transform(encoded_predictions)

print(f"First 25 Predictions:   {predictions[:25]}")
print(f"First 25 Actual labels: {y_test.iloc[:25].tolist()}")

In [None]:
# Tabulate predicted against actual test labels, replacing y_test's index
# with a fresh 0..n-1 index
comparison = pd.DataFrame({"Prediction": predictions, "Actual": y_test})
comparison.reset_index(drop=True)