In [54]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import torch
from sklearn.model_selection import train_test_split
from sklearn import tree


## Data Acquisition
##### Flat Acquisition using csv files

In [55]:
movies = pd.read_csv('./data/movies.csv')
# genome_scores = pd.read_csv('./data/genome-scores.csv')
# genome_tags = pd.read_csv('./data/genome-tags.csv')
ratings = pd.read_csv('./data/ratings.csv')
tags = pd.read_csv('./data/tags.csv')

In [56]:
# print the first 5 rows of the movies, ratings, and tags dataframes
display(movies.head())
display(ratings.head())
display(tags.head())

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


Unnamed: 0,userId,movieId,rating,timestamp
0,1,296,5.0,1147880044
1,1,306,3.5,1147868817
2,1,307,5.0,1147868828
3,1,665,5.0,1147878820
4,1,899,3.5,1147868510


Unnamed: 0,userId,movieId,tag,timestamp
0,3,260,classic,1439472355
1,3,260,sci-fi,1439472256
2,4,1732,dark comedy,1573943598
3,4,1732,great dialogue,1573943604
4,4,7569,so bad it's good,1573943455


### Data Wrangling

In [57]:
grouped_ratings = ratings.groupby('movieId').agg({'rating': ['mean', 'count'], 'timestamp': ['mean']})
grouped_ratings.columns = ['mean_rating', 'rating_count', 'mean_timestamp']
grouped_ratings = grouped_ratings.astype({'mean_timestamp': 'int'})

# Merge dei dataset movies e ratings
# rimuove i film che non hanno rating
movies_ratings = pd.merge(movies, grouped_ratings, on='movieId')

# mantiene i duplicati, per rimuoverli usare .apply[set, list]
# tags.groupby('movieId')['tag'].apply(set)
new_tags = tags.groupby('movieId')['tag'].apply(set).apply(list)

df = pd.merge(movies_ratings, new_tags, on='movieId')

display(df.head())


Unnamed: 0,movieId,title,genres,mean_rating,rating_count,mean_timestamp,tag
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,3.893708,57309,1153152210,"[mission, 3D, é˜®ä¸€é¸£, HEROIC MISSION, joss ..."
1,2,Jumanji (1995),Adventure|Children|Fantasy,3.251527,24228,1122310117,"[bad cgi, board game, time travel, Dynamic CGI..."
2,3,Grumpier Old Men (1995),Comedy|Romance,3.142028,11804,980602256,"[Jack Lemmon, old people that is actually funn..."
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance,2.853547,2523,942460471,"[characters, single mother, divorce, revenge, ..."
4,5,Father of the Bride Part II (1995),Comedy,3.058434,11714,1004723013,"[daughter, parent child relationship, confiden..."


### Data Preparation (Data cleaning, Duplicates filtering, Data encoding)

##### Data Cleaning
The following results show that there are no NaN values in the dataset.

In [58]:
# print number of rows
print('Number of rows: ', df.shape[0])

# drop rows with missing values
df = df.dropna()

# print number of rows after dropping missing values
print('Number of rows after dropping missing values: ', df.shape[0])


Number of rows:  41875
Number of rows after dropping missing values:  41875


##### Data Encoding
Multicategorical One-Hot encoding for film genres

In [59]:
# one hot encoding for genres
# print every category in movies dataset
categories = set()
for s in df['genres'].str.split('|').values:
    categories = categories.union(set(s))

# add columns for each category
for category in categories:
    df[category] = df['genres'].str.contains(category).astype(int)

# delete genres column
df = df.drop(columns=['genres'])

display(df.head())

  df[category] = df['genres'].str.contains(category).astype(int)


Unnamed: 0,movieId,title,mean_rating,rating_count,mean_timestamp,tag,Romance,Crime,Film-Noir,Drama,...,Mystery,Sci-Fi,(no genres listed),Animation,Fantasy,Thriller,Musical,Adventure,Comedy,Western
0,1,Toy Story (1995),3.893708,57309,1153152210,"[mission, 3D, é˜®ä¸€é¸£, HEROIC MISSION, joss ...",0,0,0,0,...,0,0,0,1,1,0,0,1,1,0
1,2,Jumanji (1995),3.251527,24228,1122310117,"[bad cgi, board game, time travel, Dynamic CGI...",0,0,0,0,...,0,0,0,0,1,0,0,1,0,0
2,3,Grumpier Old Men (1995),3.142028,11804,980602256,"[Jack Lemmon, old people that is actually funn...",1,0,0,0,...,0,0,0,0,0,0,0,0,1,0
3,4,Waiting to Exhale (1995),2.853547,2523,942460471,"[characters, single mother, divorce, revenge, ...",1,0,0,1,...,0,0,0,0,0,0,0,0,1,0
4,5,Father of the Bride Part II (1995),3.058434,11714,1004723013,"[daughter, parent child relationship, confiden...",0,0,0,0,...,0,0,0,0,0,0,0,0,1,0


##### Duplicates Filtering
The following results show that there are no duplicates in the dataset (as expected after merge operation).

In [60]:
df_preprocessed = df.copy()

# drop tag column TODO: find a way to use tag
df_preprocessed = df_preprocessed.drop(columns=['tag'])

# print number of rows
print('Number of rows: ', df_preprocessed.shape[0])

# drop duplicated rows
df_preprocessed = df_preprocessed.drop_duplicates()

# print number of rows after dropping missing values
print('Number of rows after dropping duplicated rows: ', df_preprocessed.shape[0])


Number of rows:  41875
Number of rows after dropping duplicated rows:  41875


## Data Visualization

In [61]:
""" # rating distribution from ratings.csv
sns.countplot(x='rating', data=ratings)
plt.show()

sns.boxplot(x='rating', data=ratings)
plt.show() """

" # rating distribution from ratings.csv\nsns.countplot(x='rating', data=ratings)\nplt.show()\n\nsns.boxplot(x='rating', data=ratings)\nplt.show() "

Rather than using discrete bins, a KDE plot smooths the observations with a Gaussian kernel, producing a continuous density estimate. This is used for continuous attributes like rating mean.

It is done to show differences after data aggregation in calculating mean for each film.

In [62]:
""" # rating distribution from df
sns.kdeplot(df['mean_rating'])
plt.xlabel('rating mean for film')
plt.title('Rating distribution in dataframe')
plt.show() """

" # rating distribution from df\nsns.kdeplot(df['mean_rating'])\nplt.xlabel('rating mean for film')\nplt.title('Rating distribution in dataframe')\nplt.show() "

Observing the distribution of all attributes for every file.

This is useful to show data trends, to reveal outliers and leverage points, to provide hints about modeling techniques to apply.


In [63]:
""" # print distribution for ratings
fig, axs = plt.subplots(3)
sns.kdeplot(ratings['timestamp'], ax=axs[0], color='r', label='timestamp')
sns.kdeplot(ratings['rating'], ax=axs[1], color='b', label='rating')
sns.kdeplot(ratings['movieId'], ax=axs[2], color='g', label='movieId')

axs[0].set_xlabel('timestamp')
axs[1].set_xlabel('rating')
axs[2].set_xlabel('movieId')
plt.show() """

" # print distribution for ratings\nfig, axs = plt.subplots(3)\nsns.kdeplot(ratings['timestamp'], ax=axs[0], color='r', label='timestamp')\nsns.kdeplot(ratings['rating'], ax=axs[1], color='b', label='rating')\nsns.kdeplot(ratings['movieId'], ax=axs[2], color='g', label='movieId')\n\naxs[0].set_xlabel('timestamp')\naxs[1].set_xlabel('rating')\naxs[2].set_xlabel('movieId')\nplt.show() "

In [64]:
# Scatter plot matrix
# Completamente inutile
# sns.pairplot(ratings)
# plt.show()

In [65]:
""" movies_to_show = 5
fig, axs = plt.subplots(movies_to_show, sharex=True, sharey=True, figsize=(10, 10))

most_rated_movies = ratings.groupby('movieId').count().sort_values('rating', ascending=False).head(movies_to_show).index

to_line_plot = []
for i in range(movies_to_show):
    tmp = ratings.where(ratings['movieId'] == most_rated_movies[i]).dropna()
    tmp['date'] = pd.to_datetime(tmp['timestamp'], unit='s')
    
    # Computing the mean for each month
    tmp = tmp.resample("M", on='date').mean()[['movieId', 'rating']].dropna()
    to_line_plot.append(tmp)
    movie_title = movies.where(movies['movieId'] == most_rated_movies[i]).dropna()['title'].values[0]
    axs[i].set_title("Movie: '" + movie_title + "'")
    

for i in range(movies_to_show):
    sns.lineplot(x='date', y='rating', ax=axs[i], data=to_line_plot[i])

fig.suptitle('Rating evolution for the 5 most rated movies')
plt.show() """

' movies_to_show = 5\nfig, axs = plt.subplots(movies_to_show, sharex=True, sharey=True, figsize=(10, 10))\n\nmost_rated_movies = ratings.groupby(\'movieId\').count().sort_values(\'rating\', ascending=False).head(movies_to_show).index\n\nto_line_plot = []\nfor i in range(movies_to_show):\n    tmp = ratings.where(ratings[\'movieId\'] == most_rated_movies[i]).dropna()\n    tmp[\'date\'] = pd.to_datetime(tmp[\'timestamp\'], unit=\'s\')\n    \n    # Computing the mean for each month\n    tmp = tmp.resample("M", on=\'date\').mean()[[\'movieId\', \'rating\']].dropna()\n    to_line_plot.append(tmp)\n    movie_title = movies.where(movies[\'movieId\'] == most_rated_movies[i]).dropna()[\'title\'].values[0]\n    axs[i].set_title("Movie: \'" + movie_title + "\'")\n    \n\nfor i in range(movies_to_show):\n    sns.lineplot(x=\'date\', y=\'rating\', ax=axs[i], data=to_line_plot[i])\n\nfig.suptitle(\'Rating evolution for the 5 most rated movies\')\nplt.show() '

In [66]:
""" # Same as above cell but the 3 movies with the highest standard deviation
movies_to_show = 3
fig, axs = plt.subplots(movies_to_show, sharex=True, sharey=True, figsize=(10, 10))

# Take movies with at least 1000 ratings
high_std_movies = ratings.where(
    ratings['movieId']
    .isin(ratings.groupby('movieId')
          .count()
          .where(ratings.groupby('movieId').count()['rating'] > 1000)
          .dropna().index)
    ).dropna()

# most_rated_movies = ratings.groupby('movieId').count().sort_values('rating', ascending=False).head(movies_to_show).index
high_std_movies = high_std_movies.groupby('movieId').std().sort_values('rating', ascending=False).head(movies_to_show).index

to_line_plot = []
for i in range(movies_to_show):
    tmp = ratings.where(ratings['movieId'] == high_std_movies[i]).dropna()
    tmp['date'] = pd.to_datetime(tmp['timestamp'], unit='s')
    
    # Computing the mean for each month
    tmp = tmp.resample("M", on='date').mean()[['movieId', 'rating']].dropna()
    to_line_plot.append(tmp)
    movie_title = movies.where(movies['movieId'] == high_std_movies[i]).dropna()['title'].values[0]
    axs[i].set_title("Movie: '" + movie_title + "'")

for i in range(movies_to_show):
    sns.lineplot(x='date', y='rating', ax=axs[i], data=to_line_plot[i])

# Print the titles of the movies
plt.show() """

' # Same as above cell but the 3 movies with the highest standard deviation\nmovies_to_show = 3\nfig, axs = plt.subplots(movies_to_show, sharex=True, sharey=True, figsize=(10, 10))\n\n# Take movies with at least 1000 ratings\nhigh_std_movies = ratings.where(\n    ratings[\'movieId\']\n    .isin(ratings.groupby(\'movieId\')\n          .count()\n          .where(ratings.groupby(\'movieId\').count()[\'rating\'] > 1000)\n          .dropna().index)\n    ).dropna()\n\n# most_rated_movies = ratings.groupby(\'movieId\').count().sort_values(\'rating\', ascending=False).head(movies_to_show).index\nhigh_std_movies = high_std_movies.groupby(\'movieId\').std().sort_values(\'rating\', ascending=False).head(movies_to_show).index\n\nto_line_plot = []\nfor i in range(movies_to_show):\n    tmp = ratings.where(ratings[\'movieId\'] == high_std_movies[i]).dropna()\n    tmp[\'date\'] = pd.to_datetime(tmp[\'timestamp\'], unit=\'s\')\n    \n    # Computing the mean for each month\n    tmp = tmp.resample("M", 

In [67]:
# TODO - compute np.unique(column_array, return_counts=True) for every attribute to visualize results
# follow the last example from this guide https://numpy.org/doc/stable/reference/generated/numpy.unique.html
 

Plot density for every attribute of dataset

In [68]:
""" # density plot for rating_count
fig, axs = plt.subplots(1,2, figsize=(10, 5))
sns.kdeplot(df['rating_count'], ax=axs[0], label='rating_count')
sns.kdeplot(df['rating_count'].where(df['rating_count'] < 50), ax=axs[1], label='rating_count < 50')

print('Number of movies with rating_count < 50: ', df.where(df['rating_count'] < 50).count()[0])
print('Number of all movies: ', df.shape[0]) """

" # density plot for rating_count\nfig, axs = plt.subplots(1,2, figsize=(10, 5))\nsns.kdeplot(df['rating_count'], ax=axs[0], label='rating_count')\nsns.kdeplot(df['rating_count'].where(df['rating_count'] < 50), ax=axs[1], label='rating_count < 50')\n\nprint('Number of movies with rating_count < 50: ', df.where(df['rating_count'] < 50).count()[0])\nprint('Number of all movies: ', df.shape[0]) "

In [69]:
# density plot for rating year

# convert timestamp to year
df['year_timestamp'] = pd.to_datetime(df['mean_timestamp'], unit='s').dt.year

# delete mean_timestamp
df = df.drop(columns=['mean_timestamp'])

""" sns.kdeplot(df['year_timestamp'])
plt.show() """

" sns.kdeplot(df['year_timestamp'])\nplt.show() "

In [70]:
""" # plot for categories
cat = list(categories)

# count the number of rows containing 1 for each category
cat_dict = {}
for category in cat:
    if category == '(no genres listed)':
        cat_dict['no genre'] = df[category].sum()
    else:
        cat_dict[category] = df[category].sum()

# plot an histogram of the number of rows for each category
fig, axs = plt.subplots(1, 1, figsize=(20, 10))
ax = sns.barplot(x=list(cat_dict.keys()), y=list(cat_dict.values()))
ax.bar_label(container=ax.containers[0], labels=list(cat_dict.keys()))
plt.xticks([])
plt.show() """


" # plot for categories\ncat = list(categories)\n\n# count the number of rows containing 1 for each category\ncat_dict = {}\nfor category in cat:\n    if category == '(no genres listed)':\n        cat_dict['no genre'] = df[category].sum()\n    else:\n        cat_dict[category] = df[category].sum()\n\n# plot an histogram of the number of rows for each category\nfig, axs = plt.subplots(1, 1, figsize=(20, 10))\nax = sns.barplot(x=list(cat_dict.keys()), y=list(cat_dict.values()))\nax.bar_label(container=ax.containers[0], labels=list(cat_dict.keys()))\nplt.xticks([])\nplt.show() "

## Data Preprocessing

In [71]:

# from title extract year
df['year'] = df['title'].str.extract('(\(\d{4}\))', expand=True)
# remove parentheses
df['year'] = df['year'].str.extract('(\d{4})', expand=True)

# remove movies with no year
df = df.dropna(subset=['year'])

# delete title column
df = df.drop(columns=['movieId', 'title'])

# convert year to int
df['year'] = df['year'].astype(int)

display(df.head())

Unnamed: 0,mean_rating,rating_count,tag,Romance,Crime,Film-Noir,Drama,Action,Children,Horror,...,(no genres listed),Animation,Fantasy,Thriller,Musical,Adventure,Comedy,Western,year_timestamp,year
0,3.893708,57309,"[mission, 3D, é˜®ä¸€é¸£, HEROIC MISSION, joss ...",0,0,0,0,0,1,0,...,0,1,1,0,0,1,1,0,2006,1995
1,3.251527,24228,"[bad cgi, board game, time travel, Dynamic CGI...",0,0,0,0,0,1,0,...,0,0,1,0,0,1,0,0,2005,1995
2,3.142028,11804,"[Jack Lemmon, old people that is actually funn...",1,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,2001,1995
3,2.853547,2523,"[characters, single mother, divorce, revenge, ...",1,0,0,1,0,0,0,...,0,0,0,0,0,0,1,0,1999,1995
4,3.058434,11714,"[daughter, parent child relationship, confiden...",0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,2001,1995


In [72]:
""" # density plot for film year
sns.kdeplot(df['year'])
plt.show() """

" # density plot for film year\nsns.kdeplot(df['year'])\nplt.show() "

### Correlation results
The following results indicate that there is no correlation between the rating timestamp and the film year, as we thought.
There are instead correlations between genres like Children-Animation and Thriller-Horror like expected.
Other genres less important correlations are Children-Adventure, Fantasy-Adventure, Action-Adventure, Crime-Thriller and Mystery-Thriller.
There is a correlation also between something different from genres as rating_count-mean_rating.
The most interesting correlation (0.8) is mean_timestamp-movieId, which indicates that the period in which the rating was made is highly movie-dependent.


In [73]:
# convert timestamp to year
# df['year_timestamp'] = pd.to_datetime(df['mean_timestamp'], unit='s').dt.year
# year_timestamp can be removed because is less precise than timestamp seeing the correlation with movieId

# delete mean_timestamp column to avoid redundancy - TODO check if it is better to keep it
# df = df.drop(columns=['mean_timestamp'])

# compute analysis between attributes of the dataset
print(df.corr())
print(df.cov())
print(df.describe())

# follow examples from slides on Data Visualization (pages 6-8)
# TODO - write results of correlation in markdown
# TODO - show results of correlation in a graphic way
# TODO - write results of covariance in markdown
# TODO - show results of covariance in a graphic way
# TODO - write results of describe in markdown
# TODO - show results of describe in a graphic way


  print(df.corr())
  print(df.cov())


                    mean_rating  rating_count   Romance     Crime  Film-Noir  \
mean_rating            1.000000      0.132640  0.043127  0.015203   0.030073   
rating_count           0.132640      1.000000  0.026379  0.050338   0.003847   
Romance                0.043127      0.026379  1.000000 -0.055434  -0.008976   
Crime                  0.015203      0.050338 -0.055434  1.000000   0.151109   
Film-Noir              0.030073      0.003847 -0.008976  0.151109   1.000000   
Drama                  0.149897     -0.001901  0.135045  0.088269   0.053391   
Action                -0.083334      0.106123 -0.069615  0.128641  -0.024931   
Children              -0.019069      0.034384 -0.042638 -0.061397  -0.020607   
Horror                -0.224045     -0.020006 -0.113761 -0.051396  -0.025305   
IMAX                   0.014933      0.113903 -0.016811 -0.009007  -0.006221   
Documentary            0.149778     -0.053596 -0.118629 -0.086411  -0.028040   
War                    0.050338      0.0

In [74]:
# remove tag column
df = df.drop(columns=['tag'])

display(df.head())

Unnamed: 0,mean_rating,rating_count,Romance,Crime,Film-Noir,Drama,Action,Children,Horror,IMAX,...,(no genres listed),Animation,Fantasy,Thriller,Musical,Adventure,Comedy,Western,year_timestamp,year
0,3.893708,57309,0,0,0,0,0,1,0,0,...,0,1,1,0,0,1,1,0,2006,1995
1,3.251527,24228,0,0,0,0,0,1,0,0,...,0,0,1,0,0,1,0,0,2005,1995
2,3.142028,11804,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,2001,1995
3,2.853547,2523,1,0,0,1,0,0,0,0,...,0,0,0,0,0,0,1,0,1999,1995
4,3.058434,11714,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,2001,1995


In [75]:
# TODO - show scatter plot for mean_rating


In [76]:
# TODO - balancing dataset


## Modeling

In [77]:
""" # Split data
df2 = df.copy()
X = df2.drop(columns=['mean_rating'])
y = df['mean_rating']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

import tensorflow as tf

# Create the model using the Keras API of Tensorflow
model = tf.keras.Sequential([
    tf.keras.layers.Dense(4, activation='relu', input_shape=(X_train.shape[1],)),
    tf.keras.layers.Dense(16, activation='relu'),
    tf.keras.layers.Dense(1)
])

# Compile the model, specifying the loss function and optimizer
model.compile(loss='mean_squared_error', metrics=['mean_squared_error'], optimizer='adam')

# Train the model on the training data
history = model.fit(X_train, y_train, epochs=100, batch_size=256, validation_split=0.2)


# Evaluate the model on the test data
loss, mse = model.evaluate(X_test, y_test, verbose=0)
print('Loss: ', loss)
print('MSE: ', mse)

display(y_test[:5])
model.predict(X_test[:5]) """

" # Split data\ndf2 = df.copy()\nX = df2.drop(columns=['mean_rating'])\ny = df['mean_rating']\n\n# Split the data into training and testing sets\nX_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)\n\nimport tensorflow as tf\n\n# Create the model using the Keras API of Tensorflow\nmodel = tf.keras.Sequential([\n    tf.keras.layers.Dense(4, activation='relu', input_shape=(X_train.shape[1],)),\n    tf.keras.layers.Dense(16, activation='relu'),\n    tf.keras.layers.Dense(1)\n])\n\n# Compile the model, specifying the loss function and optimizer\nmodel.compile(loss='mean_squared_error', metrics=['mean_squared_error'], optimizer='adam')\n\n# Train the model on the training data\nhistory = model.fit(X_train, y_train, epochs=100, batch_size=256, validation_split=0.2)\n\n\n# Evaluate the model on the test data\nloss, mse = model.evaluate(X_test, y_test, verbose=0)\nprint('Loss: ', loss)\nprint('MSE: ', mse)\n\ndisplay(y_test[:5])\nmodel.predict(X_test[:5]) 

In [78]:
""" import torch.nn as nn

# Split data
df2 = df.copy()
X = df2.drop(columns=['mean_rating'])
y = df['mean_rating']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Convert the data to tensors
X_train = torch.tensor(X_train.values, dtype=torch.float32)
y_train = torch.tensor(y_train.values, dtype=torch.float32)
X_test = torch.tensor(X_test.values, dtype=torch.float32)
y_test = torch.tensor(y_test.values, dtype=torch.float32)

# Create the model using the torch
model = nn.Sequential(
    nn.Linear(X_train.shape[1], 4),
    nn.ReLU(),
    nn.Linear(4, 16),
    nn.ReLU(),
    nn.Linear(16, 1)
)

# Compile the model, specifying the loss function and optimizer
loss_fn = nn.MSELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)

# Train the model on the training data X_train and y_train
epochs = 20
for epoch in range(epochs):
    print('Epoch ', epoch)
    y_pred = model(X_train)
    loss = loss_fn(y_pred, y_train)
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

# Evaluate the model on the test data
loss = loss_fn(model(X_test), y_test)
print('Loss: ', loss.item())

display(y_test[:5]) """


" import torch.nn as nn\n\n# Split data\ndf2 = df.copy()\nX = df2.drop(columns=['mean_rating'])\ny = df['mean_rating']\n\n# Split the data into training and testing sets\nX_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)\n\n# Convert the data to tensors\nX_train = torch.tensor(X_train.values, dtype=torch.float32)\ny_train = torch.tensor(y_train.values, dtype=torch.float32)\nX_test = torch.tensor(X_test.values, dtype=torch.float32)\ny_test = torch.tensor(y_test.values, dtype=torch.float32)\n\n# Create the model using the torch\nmodel = nn.Sequential(\n    nn.Linear(X_train.shape[1], 4),\n    nn.ReLU(),\n    nn.Linear(4, 16),\n    nn.ReLU(),\n    nn.Linear(16, 1)\n)\n\n# Compile the model, specifying the loss function and optimizer\nloss_fn = nn.MSELoss()\noptimizer = torch.optim.Adam(model.parameters(), lr=1e-3)\n\n# Train the model on the training data X_train and y_train\nepochs = 20\nfor epoch in range(epochs):\n    print('Epoch ', epoch)\n    