In [1]:
import pandas as pd
from langdetect import detect

df = pd.read_csv("GroupLensData_reduced.csv")
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 18836 entries, 0 to 18835
Data columns (total 6 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   Unnamed: 0      18836 non-null  int64  
 1   title           18836 non-null  object 
 2   genres          18836 non-null  object 
 3   Year            18836 non-null  float64
 4   average_rating  18836 non-null  float64
 5   rating_count    18836 non-null  int64  
dtypes: float64(2), int64(2), object(2)
memory usage: 883.1+ KB


In [2]:
# Define a function to detect the language of a given text
def detect_language(text):
    try:
        return detect(text)
    except:
        return 'unknown'

# Apply the detect_language function to the 'title' column
df['Language'] = df['title'].apply(detect_language)

# Print the unique languages detected
print(df['Language'].unique())

['en' 'sv' 'it' 'ca' 'fr' 'af' 'sw' 'no' 'pt' 'tr' 'da' 'so' 'et' 'es'
 'sq' 'ro' 'de' 'tl' 'nl' 'id' 'sl' 'cy' 'pl' 'hr' 'unknown' 'lt' 'sk'
 'fi' 'hu' 'vi' 'lv' 'cs' 'bg' 'el' 'ru' 'ja' 'ko' 'mk']


In [3]:
# Filter the DataFrame to keep only rows where the language of the title is 'en'
df_en = df[df['Language'] == 'en']

In [4]:
df_en.info()

<class 'pandas.core.frame.DataFrame'>
Index: 11235 entries, 0 to 18833
Data columns (total 7 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   Unnamed: 0      11235 non-null  int64  
 1   title           11235 non-null  object 
 2   genres          11235 non-null  object 
 3   Year            11235 non-null  float64
 4   average_rating  11235 non-null  float64
 5   rating_count    11235 non-null  int64  
 6   Language        11235 non-null  object 
dtypes: float64(2), int64(2), object(3)
memory usage: 702.2+ KB


In [5]:
df_en.head(15)

Unnamed: 0.1,Unnamed: 0,title,genres,Year,average_rating,rating_count,Language
0,470498,Inception (2010),Action|Crime|Drama|Mystery|Sci-Fi|Thriller|IMAX,2010.0,4.44,8817,en
3,306510,Eternal Sunshine of the Spotless Mind (2004),Drama|Romance|Sci-Fi,2004.0,4.51,4507,en
9,233346,"Dark Knight, The (2008)",Action|Crime|Drama|IMAX,2008.0,4.38,3598,en
10,758528,"Prestige, The (2006)",Drama|Mystery|Sci-Fi|Thriller,2006.0,4.34,3598,en
12,593626,"Lord of the Rings: The Fellowship of the Ring,...",Adventure|Fantasy,2001.0,4.45,3372,en
13,483369,Inglourious Basterds (2009),Action|Drama|War,2009.0,4.19,3278,en
21,381048,"Grand Budapest Hotel, The (2014)",Comedy|Drama,2014.0,4.25,2911,en
25,596998,"Lord of the Rings: The Return of the King, The...",Action|Adventure|Drama|Fantasy,2003.0,4.48,2845,en
26,977847,The Martian (2015),Adventure|Drama|Sci-Fi,2015.0,4.17,2809,en
27,262388,Django Unchained (2012),Action|Drama|Western,2012.0,4.3,2794,en


In [6]:
# Remove the 'Language' column from the filtered DataFrame df_en, since the column is no longer needed.
df_en.drop(columns=['Language'], inplace=True)
df_en.head()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_en.drop(columns=['Language'], inplace=True)


Unnamed: 0.1,Unnamed: 0,title,genres,Year,average_rating,rating_count
0,470498,Inception (2010),Action|Crime|Drama|Mystery|Sci-Fi|Thriller|IMAX,2010.0,4.44,8817
3,306510,Eternal Sunshine of the Spotless Mind (2004),Drama|Romance|Sci-Fi,2004.0,4.51,4507
9,233346,"Dark Knight, The (2008)",Action|Crime|Drama|IMAX,2008.0,4.38,3598
10,758528,"Prestige, The (2006)",Drama|Mystery|Sci-Fi|Thriller,2006.0,4.34,3598
12,593626,"Lord of the Rings: The Fellowship of the Ring,...",Adventure|Fantasy,2001.0,4.45,3372


In [11]:
# Split the 'genres' column by the delimiter '|' and create a list of genres
genre_lists = df_en['genres'].str.split('|')

# Create a set to store unique genres
unique_genres = set()

# Iterate over each list of genres and add them to the set of unique genres
for genre_list in genre_lists:
    unique_genres.update(genre_list)

# Print the unique genres
print("Unique genres:", unique_genres)
print("Number of unique genres:", len(unique_genres))

Unique genres: {'Animation', 'Adventure', 'Mystery', 'War', 'Drama', '(no genres listed)', 'Action', 'Romance', 'Film-Noir', 'Musical', 'Comedy', 'Thriller', 'Crime', 'Documentary', 'Children', 'Sci-Fi', 'IMAX', 'Western', 'Fantasy', 'Horror'}
Number of unique genres: 20


In [10]:
# Split the 'genres' column by the delimiter '|' and explode it into separate rows
df_genres = df_en['genres'].str.split('|', expand=True).stack()

# Count the occurrences of each genre
genre_counts = df_genres.value_counts()

# Print the genre counts
print("Genre Counts:")
print(genre_counts)

Genre Counts:
Drama                 4674
Comedy                3182
Thriller              2027
Documentary           1610
Action                1363
Romance               1343
Horror                1318
Crime                  894
Adventure              878
Sci-Fi                 799
Fantasy                674
Animation              671
Mystery                608
Children               566
(no genres listed)     381
War                    271
Musical                136
IMAX                   105
Western                 87
Film-Noir               11
Name: count, dtype: int64


In [13]:
genres_to_remove = ["Film-Noir", "War", "(no genres listed)"]
df_en['genres'] = df_en['genres'].apply(lambda x: '|'.join([genre for genre in x.split('|') if genre not in genres_to_remove]))
df_en.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_en['genres'] = df_en['genres'].apply(lambda x: '|'.join([genre for genre in x.split('|') if genre not in genres_to_remove]))


Unnamed: 0.1,Unnamed: 0,title,genres,Year,average_rating,rating_count
0,470498,Inception (2010),Action|Crime|Drama|Mystery|Sci-Fi|Thriller|IMAX,2010.0,4.44,8817
3,306510,Eternal Sunshine of the Spotless Mind (2004),Drama|Romance|Sci-Fi,2004.0,4.51,4507
9,233346,"Dark Knight, The (2008)",Action|Crime|Drama|IMAX,2008.0,4.38,3598
10,758528,"Prestige, The (2006)",Drama|Mystery|Sci-Fi|Thriller,2006.0,4.34,3598
12,593626,"Lord of the Rings: The Fellowship of the Ring,...",Adventure|Fantasy,2001.0,4.45,3372


In [16]:
print(df_en['genres'])

0        Action|Crime|Drama|Mystery|Sci-Fi|Thriller|IMAX
3                                   Drama|Romance|Sci-Fi
9                                Action|Crime|Drama|IMAX
10                         Drama|Mystery|Sci-Fi|Thriller
12                                     Adventure|Fantasy
                              ...                       
18828                                    Horror|Thriller
18830                                      Drama|Romance
18831                                             Horror
18832                                             Comedy
18833                           Action|Adventure|Western
Name: genres, Length: 11235, dtype: object


In [18]:
df_en=df_en.reset_index()
df_en.head()

Unnamed: 0.1,level_0,index,Unnamed: 0,title,genres,Year,average_rating,rating_count
0,0,0,470498,Inception (2010),Action|Crime|Drama|Mystery|Sci-Fi|Thriller|IMAX,2010.0,4.44,8817
1,1,3,306510,Eternal Sunshine of the Spotless Mind (2004),Drama|Romance|Sci-Fi,2004.0,4.51,4507
2,2,9,233346,"Dark Knight, The (2008)",Action|Crime|Drama|IMAX,2008.0,4.38,3598
3,3,10,758528,"Prestige, The (2006)",Drama|Mystery|Sci-Fi|Thriller,2006.0,4.34,3598
4,4,12,593626,"Lord of the Rings: The Fellowship of the Ring,...",Adventure|Fantasy,2001.0,4.45,3372


In [19]:
# Drop the columns 'level_0', 'index', and 'Unnamed: 0'
df_en.drop(columns=['level_0', 'index', 'Unnamed: 0'], inplace=True)
df_en.head()

Unnamed: 0,title,genres,Year,average_rating,rating_count
0,Inception (2010),Action|Crime|Drama|Mystery|Sci-Fi|Thriller|IMAX,2010.0,4.44,8817
1,Eternal Sunshine of the Spotless Mind (2004),Drama|Romance|Sci-Fi,2004.0,4.51,4507
2,"Dark Knight, The (2008)",Action|Crime|Drama|IMAX,2008.0,4.38,3598
3,"Prestige, The (2006)",Drama|Mystery|Sci-Fi|Thriller,2006.0,4.34,3598
4,"Lord of the Rings: The Fellowship of the Ring,...",Adventure|Fantasy,2001.0,4.45,3372


In [20]:
# Define the list of genres
genres = ['Animation', 'Adventure',
          'Mystery', 'Drama',
          'Action', 'Romance',
          'Musical', 'Comedy',
          'Thriller', 'Crime',
          'Documentary', 'Children',
          'Sci-Fi', 'IMAX',
          'Western', 'Fantasy', 'Horror']

# Iterate over each genre and create a new column for each
for genre in genres:
    # Create a new column for the genre and set its value to 1 if the genre is present in the 'genres' column, otherwise 0
    df_en[genre] = df_en['genres'].str.contains(genre).astype(int)
df_en.head()

Unnamed: 0,title,genres,Year,average_rating,rating_count,Animation,Adventure,Mystery,Drama,Action,...,Comedy,Thriller,Crime,Documentary,Children,Sci-Fi,IMAX,Western,Fantasy,Horror
0,Inception (2010),Action|Crime|Drama|Mystery|Sci-Fi|Thriller|IMAX,2010.0,4.44,8817,0,0,1,1,1,...,0,1,1,0,0,1,1,0,0,0
1,Eternal Sunshine of the Spotless Mind (2004),Drama|Romance|Sci-Fi,2004.0,4.51,4507,0,0,0,1,0,...,0,0,0,0,0,1,0,0,0,0
2,"Dark Knight, The (2008)",Action|Crime|Drama|IMAX,2008.0,4.38,3598,0,0,0,1,1,...,0,0,1,0,0,0,1,0,0,0
3,"Prestige, The (2006)",Drama|Mystery|Sci-Fi|Thriller,2006.0,4.34,3598,0,0,1,1,0,...,0,1,0,0,0,1,0,0,0,0
4,"Lord of the Rings: The Fellowship of the Ring,...",Adventure|Fantasy,2001.0,4.45,3372,0,1,0,0,0,...,0,0,0,0,0,0,0,0,1,0


In [21]:
df_en.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11235 entries, 0 to 11234
Data columns (total 22 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   title           11235 non-null  object 
 1   genres          11235 non-null  object 
 2   Year            11235 non-null  float64
 3   average_rating  11235 non-null  float64
 4   rating_count    11235 non-null  int64  
 5   Animation       11235 non-null  int32  
 6   Adventure       11235 non-null  int32  
 7   Mystery         11235 non-null  int32  
 8   Drama           11235 non-null  int32  
 9   Action          11235 non-null  int32  
 10  Romance         11235 non-null  int32  
 11  Musical         11235 non-null  int32  
 12  Comedy          11235 non-null  int32  
 13  Thriller        11235 non-null  int32  
 14  Crime           11235 non-null  int32  
 15  Documentary     11235 non-null  int32  
 16  Children        11235 non-null  int32  
 17  Sci-Fi          11235 non-null 

In [22]:
# Drop the 'genres' column
df_en.drop(columns=['genres'], inplace=True)

# Save the DataFrame to a CSV file named 'data_for_ml.csv'
df_en.to_csv('data_for_ml.csv', index=False)