# Logistic Regression
Logistic Regression is a statistical method for predicting binary outcomes from data.

Examples of this are "yes" vs "no" or "young" vs "old".

These are categories that translate to probability of being a 0 or a 1.

Source: Logistic Regression

In [1]:
%matplotlib inline
import matplotlib.pyplot as plt
import pandas as pd

In [2]:
# read file
csv_file = "clean_movies.csv"

# store .csv into dataframe
movies_df = pd.read_csv(csv_file)
movies_df.head()

Unnamed: 0,title,year,lifetime_gross,ratingInteger,ratingCount,duration,nrOfWins,nrOfNominations,nrOfPhotos,nrOfNewsArticles,...,RealityTV,Romance,SciFi,Short,Sport,TalkShow,Thriller,War,Western,performance
0,METROPOLIS,1927,1236166,8,81007,9180,3,4,67,428,...,0,0,1,0,0,0,0,0,0,1
1,CITY LIGHTS,1931,19181,9,70057,5220,2,0,38,187,...,0,1,0,0,0,0,0,0,0,1
2,MODERN TIMES,1936,163577,9,90847,5220,3,1,44,27,...,0,0,0,0,0,0,0,0,0,1
3,GONE WITH THE WIND,1939,198676459,8,160414,14280,10,6,143,1263,...,0,1,0,0,0,0,0,1,0,1
4,THE WIZARD OF OZ,1939,22342633,8,209506,6120,6,12,126,2363,...,0,0,0,0,0,0,0,0,0,1


In [25]:
# Remove title 

movies_df.drop(['title'], axis=1)

Unnamed: 0,year,lifetime_gross,ratingInteger,ratingCount,duration,nrOfWins,nrOfNominations,nrOfPhotos,nrOfNewsArticles,nrOfUserReviews,...,RealityTV,Romance,SciFi,Short,Sport,TalkShow,Thriller,War,Western,performance
0,1927,1236166,8,81007,9180,3,4,67,428,376,...,0,0,1,0,0,0,0,0,0,1
1,1931,19181,9,70057,5220,2,0,38,187,186,...,0,1,0,0,0,0,0,0,0,1
2,1936,163577,9,90847,5220,3,1,44,27,180,...,0,0,0,0,0,0,0,0,0,1
3,1939,198676459,8,160414,14280,10,6,143,1263,653,...,0,1,0,0,0,0,0,1,0,1
4,1939,22342633,8,209506,6120,6,12,126,2363,477,...,0,0,0,0,0,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3741,2006,24146161,8,56014,6000,27,6,62,756,499,...,0,0,0,0,0,0,0,0,0,1
3742,2007,2099,5,1328,5460,3,1,7,7,20,...,0,0,0,0,0,0,0,0,0,1
3743,2007,17609452,5,57934,5640,0,0,46,232,371,...,0,0,0,0,0,0,0,0,0,1
3744,2005,1435,7,1250,7560,0,0,0,0,13,...,0,0,0,0,0,0,0,0,0,1


In [26]:
# Set features. This will also be used as your x values.
selected_features = movies_df[[ "year", "lifetime_gross", "ratingInteger", "ratingCount", "duration", "nrOfWins", "nrOfNominations", 
                  "nrOfPhotos", "nrOfNewsArticles", "nrOfUserReviews", "Action", "Adult", "Adventure", "Animation",
                  "Biography", "Comedy", "Crime", "Documentary", "Drama", "Family", "Fantasy", "Horror", "Music", "Musical", "Mystery",
                  "News", "RealityTV", "Romance", "SciFi", "Short", "Sport", "TalkShow", "Thriller", "War", "Western"]]

In [27]:
selected_features.head()

Unnamed: 0,year,lifetime_gross,ratingInteger,ratingCount,duration,nrOfWins,nrOfNominations,nrOfPhotos,nrOfNewsArticles,nrOfUserReviews,...,News,RealityTV,Romance,SciFi,Short,Sport,TalkShow,Thriller,War,Western
0,1927,1236166,8,81007,9180,3,4,67,428,376,...,0,0,0,1,0,0,0,0,0,0
1,1931,19181,9,70057,5220,2,0,38,187,186,...,0,0,1,0,0,0,0,0,0,0
2,1936,163577,9,90847,5220,3,1,44,27,180,...,0,0,0,0,0,0,0,0,0,0
3,1939,198676459,8,160414,14280,10,6,143,1263,653,...,0,0,1,0,0,0,0,0,1,0
4,1939,22342633,8,209506,6120,6,12,126,2363,477,...,0,0,0,0,0,0,0,0,0,0


In [28]:
X = selected_features.drop("ratingInteger", axis=1)
y = selected_features["ratingInteger"]
print(X.shape, y.shape)

(3746, 34) (3746,)


In [29]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1, stratify=y)

In [30]:
X_train.head()

Unnamed: 0,year,lifetime_gross,ratingCount,duration,nrOfWins,nrOfNominations,nrOfPhotos,nrOfNewsArticles,nrOfUserReviews,Action,...,News,RealityTV,Romance,SciFi,Short,Sport,TalkShow,Thriller,War,Western
579,2012,37400127,129924,6960,1,4,61,1599,248,0,...,0,0,0,0,0,0,0,0,0,0
632,1989,95860116,204703,7680,14,17,32,276,425,0,...,0,0,0,0,0,0,0,0,0,0
155,2004,67303450,58431,8160,1,2,28,160,337,1,...,0,0,0,0,0,0,0,0,0,1
712,2010,90759676,167079,6960,0,0,154,3391,438,1,...,0,0,0,0,0,0,0,0,0,0
255,1973,115000000,52962,6600,8,6,57,392,222,0,...,0,0,0,0,0,0,0,0,0,0


In [31]:
# Scale your data
from sklearn.preprocessing import MinMaxScaler
X_scaler = MinMaxScaler().fit(X_train)

In [32]:
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

In [33]:
# Create a Logistic Regression Model

from sklearn.linear_model import LogisticRegression
classifier = LogisticRegression()
classifier

LogisticRegression()

In [34]:
# Fit or train model using the training data

classifier.fit(X_train, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


LogisticRegression()

In [35]:
print(f"Training Data Score: {classifier.score(X_train, y_train)}")
print(f"Testing Data Score: {classifier.score(X_test, y_test)}")

Training Data Score: 0.3638305446778213
Testing Data Score: 0.36926360725720386


In [36]:
predictions = classifier.predict(X_test)
print(f"First 10 Predictions:   {predictions[:10]}")
print(f"First 10 Actual labels: {y_test[:10].tolist()}")

First 10 Predictions:   [6 7 7 7 6 7 8 7 7 8]
First 10 Actual labels: [5, 7, 7, 6, 7, 6, 8, 6, 7, 8]


In [37]:
pd.DataFrame({"Prediction": predictions, "Actual": y_test}).reset_index(drop=True)

Unnamed: 0,Prediction,Actual
0,6,5
1,7,7
2,7,7
3,7,6
4,6,7
...,...,...
932,6,6
933,7,6
934,6,6
935,7,6
