In [1]:
# Import our dependencies
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import pandas as pd
import tensorflow as tf

# Import pandas and read the charity_data.csv from the provided cloud URL.
import pandas as pd
from pathlib import Path
path = Path("Resources/final_combined_data.csv")
df = pd.read_csv(path)
df.head()

Unnamed: 0,row_id,movie_name,rating,votes,meta_score,genre0,genre1,genre2,genre3,pr_rating,...,cast1,cast2,cast3,cast4,director,number_rating,weekly_rank,weekly_hours_viewed,weekly_views,cumulative_weeks_in_top_10
0,506,1917,8.2,654000.0,78,"Action, Drama, War",Action,Drama,War,R,...,Dean-Charles Chapman,George MacKay,Daniel Mays,Colin Firth,Sam Mendes,4,7,10840000,,1
1,1356,21 Jump Street,7.2,589000.0,69,"Action, Comedy, Crime",Action,Comedy,Crime,R,...,Jonah Hill,Channing Tatum,Ice Cube,Brie Larson,Phil LordChristopher Miller,4,6,11690000,,1
2,1698,6 Underground,6.1,192000.0,41,"Action, Thriller",Action,Thriller,,R,...,Ryan Reynolds,Mélanie Laurent,Manuel Garcia-Rulfo,Ben Hardy,Michael Bay,4,5,8730000,,3
3,1698,6 Underground,6.1,192000.0,41,"Action, Thriller",Action,Thriller,,R,...,Ryan Reynolds,Mélanie Laurent,Manuel Garcia-Rulfo,Ben Hardy,Michael Bay,4,7,7440000,,2
4,1698,6 Underground,6.1,192000.0,41,"Action, Thriller",Action,Thriller,,R,...,Ryan Reynolds,Mélanie Laurent,Manuel Garcia-Rulfo,Ben Hardy,Michael Bay,4,9,7770000,,1


In [2]:
filtered_df = df[['row_id', 'rating', 'votes', 'meta_score', 'number_rating', 'weekly_rank', 'weekly_hours_viewed', 'cumulative_weeks_in_top_10' ]]
filtered_df.head()

Unnamed: 0,row_id,rating,votes,meta_score,number_rating,weekly_rank,weekly_hours_viewed,cumulative_weeks_in_top_10
0,506,8.2,654000.0,78,4,7,10840000,1
1,1356,7.2,589000.0,69,4,6,11690000,1
2,1698,6.1,192000.0,41,4,5,8730000,3
3,1698,6.1,192000.0,41,4,7,7440000,2
4,1698,6.1,192000.0,41,4,9,7770000,1


In [3]:
# Determine the number of unique values in each column.
filtered_df.nunique()

row_id                        182
rating                         36
votes                         155
meta_score                     63
number_rating                   5
weekly_rank                    10
weekly_hours_viewed           387
cumulative_weeks_in_top_10     14
dtype: int64

In [4]:
# Look at APPLICATION_TYPE value counts to identify and replace with "Other"
filtered_df['weekly_hours_viewed'].value_counts()

weekly_hours_viewed
5400000     6
6500000     5
5900000     4
7300000     4
12290000    3
           ..
4960000     1
65920000    1
69540000    1
24700000    1
55650000    1
Name: count, Length: 387, dtype: int64

In [5]:
# Choose a cutoff value and create a list of application types to be replaced
# use the variable name `top_10_replace`
wkly_hours_viewed = filtered_df['weekly_hours_viewed'].value_counts()

cutoff = 500
wkly_hours_replace = wkly_hours_viewed[wkly_hours_viewed < cutoff].index.tolist()

# Replace in dataframe
for app in wkly_hours_replace:
    filtered_df['weekly_hours_viewed'] = filtered_df['weekly_hours_viewed'].replace(app,"Other")

# Check to make sure replacement was successful
filtered_df['weekly_hours_viewed'].value_counts()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_df['weekly_hours_viewed'] = filtered_df['weekly_hours_viewed'].replace(app,"Other")


weekly_hours_viewed
Other    483
Name: count, dtype: int64

In [6]:
# Look at weekly_rank value counts to identify and replace with "Other"
filtered_df['weekly_rank'].value_counts()

weekly_rank
9     60
1     54
7     53
10    51
6     50
5     49
3     47
4     42
8     41
2     36
Name: count, dtype: int64

In [7]:
# You may find it helpful to look at CLASSIFICATION value counts >1
wky_counts = filtered_df['weekly_rank'].value_counts()

#creating the code for the value count >1
wky_counts[wky_counts > 1]

weekly_rank
9     60
1     54
7     53
10    51
6     50
5     49
3     47
4     42
8     41
2     36
Name: count, dtype: int64

In [8]:
# Choose a cutoff value and create a list of classifications to be replaced
# use the variable name `weekly_rank_replace`
wky_counts = filtered_df['weekly_rank'].value_counts()

cutoff = 500
weekly_rank_replace = wky_counts[wky_counts < cutoff].index.tolist()

# Replace in dataframe
for cls in weekly_rank_replace:
    filtered_df['weekly_rank'] = filtered_df['weekly_rank'].replace(cls,"Other")

# Check to make sure replacement was successful
filtered_df['weekly_rank'].value_counts()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_df['weekly_rank'] = filtered_df['weekly_rank'].replace(cls,"Other")


weekly_rank
Other    483
Name: count, dtype: int64

In [9]:
# Convert categorical data to numeric with `pd.get_dummies`
converted_df = pd.get_dummies(filtered_df)
converted_df.head()

Unnamed: 0,row_id,rating,votes,meta_score,number_rating,cumulative_weeks_in_top_10,weekly_rank_Other,weekly_hours_viewed_Other
0,506,8.2,654000.0,78,4,1,True,True
1,1356,7.2,589000.0,69,4,1,True,True
2,1698,6.1,192000.0,41,4,3,True,True
3,1698,6.1,192000.0,41,4,2,True,True
4,1698,6.1,192000.0,41,4,1,True,True


In [10]:
# Split our preprocessed data into our features and target arrays
X = converted_df.drop("cumulative_weeks_in_top_10", axis=1)
y = converted_df["cumulative_weeks_in_top_10"]

# Split the preprocessed data into a training and testing dataset
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

In [11]:
# Create a StandardScaler instances
scaler = StandardScaler()

# Fit the StandardScaler
X_scaler = scaler.fit(X_train)

# Scale the data
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

In [12]:
# Define the model - deep neural net, i.e., the number of input features and hidden nodes for each layer.

model = tf.keras.models.Sequential()

# First hidden layer
model.add(tf.keras.layers.Dense(units=80, activation="relu", input_dim=7))

# Second hidden layer
model.add(tf.keras.layers.Dense(units=50, activation="relu"))

# Third hidden layer
# model.add(tf.keras.layers.Dense(units=70, activation="relu"))

# Output layer
model.add(tf.keras.layers.Dense(units=1, activation="sigmoid"))

# Check the structure of the model
model.summary()

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


In [13]:
# Compile the model
model.compile(loss="binary_crossentropy", optimizer="adam", metrics=["accuracy"])

In [14]:
# Train the model
model.fit(X_train_scaled, y_train, epochs=100)

Epoch 1/100
[1m12/12[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.2585 - loss: 0.3919  
Epoch 2/100
[1m12/12[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - accuracy: 0.3486 - loss: -1.1678
Epoch 3/100
[1m12/12[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - accuracy: 0.3543 - loss: -2.8586
Epoch 4/100
[1m12/12[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - accuracy: 0.3813 - loss: -5.0441 
Epoch 5/100
[1m12/12[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - accuracy: 0.3741 - loss: -8.1648 
Epoch 6/100
[1m12/12[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - accuracy: 0.3211 - loss: -13.5308 
Epoch 7/100
[1m12/12[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - accuracy: 0.3762 - loss: -18.0277 
Epoch 8/100
[1m12/12[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.3690 - loss: -24.6649 
Epoch 9/100
[1m12/12[0m [32m

<keras.src.callbacks.history.History at 0x282454ac0>

In [15]:
# Evaluate the model using the test data
model_loss, model_accuracy = model.evaluate(X_test_scaled,y_test,verbose=2)
print(f"Loss: {model_loss}, Accuracy: {model_accuracy}")

4/4 - 0s - 17ms/step - accuracy: 0.4298 - loss: -5.5183e+04
Loss: -55183.25390625, Accuracy: 0.42975205183029175
