# Logistic Regression
Logistic Regression is a statistical method for predicting binary outcomes from data.

Examples of this are "yes" vs "no" or "young" vs "old".

These are categories that translate to probability of being a 0 or a 1.

Source: Logistic Regression

In [1]:
%matplotlib inline
import matplotlib.pyplot as plt
import pandas as pd

In [2]:
# Read file
csv_file = "clean_movies.csv"

# Rtore .csv into dataframe
movies_df = pd.read_csv(csv_file)
movies_df.head()

Unnamed: 0,performance,Action,Adult,Adventure,Animation,Biography,Comedy,Crime,Documentary,Drama,...,lifetime_gross,nrOfNewsArticles,nrOfNominations,nrOfPhotos,nrOfUserReviews,nrOfWins,ratingCount,ratingInteger,title,year
0,Success,0,0,0,0,0,0,0,0,1,...,1236166,428,4,67,376,3,81007,8,METROPOLIS,1927
1,Success,0,0,0,0,0,1,0,0,1,...,19181,187,0,38,186,2,70057,9,CITY LIGHTS,1931
2,Success,0,0,0,0,0,1,0,0,1,...,163577,27,1,44,180,3,90847,9,MODERN TIMES,1936
3,Success,0,0,0,0,0,0,0,0,1,...,198676459,1263,6,143,653,10,160414,8,GONE WITH THE WIND,1939
4,Success,0,0,1,0,0,0,0,0,0,...,22342633,2363,12,126,477,6,209506,8,THE WIZARD OF OZ,1939


In [3]:
# Remove title since it is a string
movies_df.drop(['title', 'ratingInteger'], axis=1)

Unnamed: 0,performance,Action,Adult,Adventure,Animation,Biography,Comedy,Crime,Documentary,Drama,...,Western,duration,lifetime_gross,nrOfNewsArticles,nrOfNominations,nrOfPhotos,nrOfUserReviews,nrOfWins,ratingCount,year
0,Success,0,0,0,0,0,0,0,0,1,...,0,9180,1236166,428,4,67,376,3,81007,1927
1,Success,0,0,0,0,0,1,0,0,1,...,0,5220,19181,187,0,38,186,2,70057,1931
2,Success,0,0,0,0,0,1,0,0,1,...,0,5220,163577,27,1,44,180,3,90847,1936
3,Success,0,0,0,0,0,0,0,0,1,...,0,14280,198676459,1263,6,143,653,10,160414,1939
4,Success,0,0,1,0,0,0,0,0,0,...,0,6120,22342633,2363,12,126,477,6,209506,1939
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3741,Success,0,0,0,0,0,0,0,1,0,...,0,6000,24146161,756,6,62,499,27,56014,2006
3742,Fail,0,0,0,0,0,0,0,0,1,...,0,5460,2099,7,1,7,20,3,1328,2007
3743,Fail,0,0,0,0,0,0,0,0,0,...,0,5640,17609452,232,0,46,371,0,57934,2007
3744,Fail,1,0,0,0,0,0,1,0,1,...,0,7560,1435,0,0,0,13,0,1250,2005


In [4]:
# Set features, which will also be used as x values
selected_features = movies_df[[ "performance", "year", "lifetime_gross", "ratingCount", "duration", "nrOfWins", "nrOfNominations", 
                  "nrOfPhotos", "nrOfNewsArticles", "nrOfUserReviews", "Action", "Adult", "Adventure", "Animation",
                  "Biography", "Comedy", "Crime", "Documentary", "Drama", "Family", "Fantasy", "Horror", "Music", "Musical", "Mystery",
                  "News", "RealityTV", "Romance", "SciFi", "Short", "Sport", "TalkShow", "Thriller", "War", "Western"]]

In [5]:
selected_features.head()

Unnamed: 0,performance,year,lifetime_gross,ratingCount,duration,nrOfWins,nrOfNominations,nrOfPhotos,nrOfNewsArticles,nrOfUserReviews,...,News,RealityTV,Romance,SciFi,Short,Sport,TalkShow,Thriller,War,Western
0,Success,1927,1236166,81007,9180,3,4,67,428,376,...,0,0,0,1,0,0,0,0,0,0
1,Success,1931,19181,70057,5220,2,0,38,187,186,...,0,0,1,0,0,0,0,0,0,0
2,Success,1936,163577,90847,5220,3,1,44,27,180,...,0,0,0,0,0,0,0,0,0,0
3,Success,1939,198676459,160414,14280,10,6,143,1263,653,...,0,0,1,0,0,0,0,0,1,0
4,Success,1939,22342633,209506,6120,6,12,126,2363,477,...,0,0,0,0,0,0,0,0,0,0


# Create a Train Test Split

In [6]:
X = selected_features.drop("performance", axis=1)
y = selected_features["performance"]
print(X.shape, y.shape)

(3746, 34) (3746,)


In [7]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1, stratify=y)

In [8]:
X_train.head()

Unnamed: 0,year,lifetime_gross,ratingCount,duration,nrOfWins,nrOfNominations,nrOfPhotos,nrOfNewsArticles,nrOfUserReviews,Action,...,News,RealityTV,Romance,SciFi,Short,Sport,TalkShow,Thriller,War,Western
963,1983,5310748,2940,7080,5,12,7,57,33,0,...,0,0,0,0,0,0,0,0,0,0
1692,2006,22545080,56169,7260,2,7,59,350,633,0,...,0,0,0,0,0,0,0,0,0,0
138,2014,257760692,52708,6000,1,0,42,1651,225,0,...,0,0,0,0,0,0,0,0,0,0
3137,1996,103046663,179802,6180,7,4,75,3250,824,0,...,0,0,0,0,0,0,0,0,0,0
597,1992,24276506,14629,8400,2,4,7,33,62,0,...,0,0,0,0,0,0,0,0,0,0


# Pre-Processing

In [9]:
# Scale your data
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
X_scaler = MinMaxScaler().fit(X_train)

In [10]:
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

In [11]:
# Label-encode data set
label_encoder = LabelEncoder()
label_encoder.fit(y_train)
encoded_y_train = label_encoder.transform(y_train)
encoded_y_test = label_encoder.transform(y_test)

print(encoded_y_train)

[1 0 1 ... 0 0 1]


# Train The Model

In [12]:
# Create a Logistic Regression Model
from sklearn.linear_model import LogisticRegression
classifier = LogisticRegression()
classifier

LogisticRegression()

In [13]:
# Fit or train model using the training data
classifier.fit(X_train_scaled, encoded_y_train)

LogisticRegression()

In [20]:
print(f"Training Data Score: {classifier.score(X_train_scaled, encoded_y_train)}")
print(f"Testing Data Score: {classifier.score(X_test_scaled, encoded_y_test)}")

Training Data Score: 0.882876468494126
Testing Data Score: 0.880469583778015


In [21]:
predictions = classifier.predict(X_test_scaled)
print(f"First 25 Predictions:   {predictions[:25]}")
print(f"First 25 Actual labels: {y_test[:25].tolist()}")

First 25 Predictions:   [0 1 0 0 0 0 0 0 0 1 0 1 0 1 0 0 0 0 0 0 0 0 0 0 0]
First 25 Actual labels: ['Fail', 'Fail', 'Fail', 'Fail', 'Fail', 'Fail', 'Success', 'Fail', 'Fail', 'Success', 'Success', 'Success', 'Fail', 'Fail', 'Fail', 'Fail', 'Fail', 'Fail', 'Fail', 'Fail', 'Fail', 'Fail', 'Fail', 'Fail', 'Fail']


In [22]:
pd.DataFrame({"Prediction": predictions, "Actual": y_test}).reset_index(drop=True)

Unnamed: 0,Prediction,Actual
0,0,Fail
1,1,Fail
2,0,Fail
3,0,Fail
4,0,Fail
...,...,...
932,0,Fail
933,0,Fail
934,0,Fail
935,0,Fail
