In [2]:
import pandas as pd

In [23]:
# Read both IMDb CSV exports; low_memory=False makes pandas parse each file
# as a whole instead of in chunks.
df_movies, df_ratings = (
    pd.read_csv(csv_name, low_memory=False)
    for csv_name in ("IMDb movies.csv", "IMDb ratings.csv")
)

In [25]:
# List the column names available in the movies table
df_movies.columns

Index(['imdb_title_id', 'title', 'original_title', 'year', 'date_published',
       'genre', 'duration', 'country', 'language', 'director', 'writer',
       'production_company', 'actors', 'description', 'avg_vote', 'votes',
       'budget', 'usa_gross_income', 'worlwide_gross_income', 'metascore',
       'reviews_from_users', 'reviews_from_critics'],
      dtype='object')

In [26]:
# List the column names available in the ratings table
df_ratings.columns

Index(['imdb_title_id', 'weighted_average_vote', 'total_votes', 'mean_vote',
       'median_vote', 'votes_10', 'votes_9', 'votes_8', 'votes_7', 'votes_6',
       'votes_5', 'votes_4', 'votes_3', 'votes_2', 'votes_1',
       'allgenders_0age_avg_vote', 'allgenders_0age_votes',
       'allgenders_18age_avg_vote', 'allgenders_18age_votes',
       'allgenders_30age_avg_vote', 'allgenders_30age_votes',
       'allgenders_45age_avg_vote', 'allgenders_45age_votes',
       'males_allages_avg_vote', 'males_allages_votes', 'males_0age_avg_vote',
       'males_0age_votes', 'males_18age_avg_vote', 'males_18age_votes',
       'males_30age_avg_vote', 'males_30age_votes', 'males_45age_avg_vote',
       'males_45age_votes', 'females_allages_avg_vote',
       'females_allages_votes', 'females_0age_avg_vote', 'females_0age_votes',
       'females_18age_avg_vote', 'females_18age_votes',
       'females_30age_avg_vote', 'females_30age_votes',
       'females_45age_avg_vote', 'females_45age_votes',
       

In [27]:
# Keep only the columns this walkthrough needs from each table
movie_columns = ['imdb_title_id', 'title', 'year', 'genre', 'country']
rating_columns = ['imdb_title_id', 'total_votes', 'mean_vote']

df_movies = df_movies[movie_columns]
df_ratings = df_ratings[rating_columns]

In [28]:
# Preview the trimmed movies table
df_movies

Unnamed: 0,imdb_title_id,title,year,genre,country
0,tt0000009,Miss Jerry,1894,Romance,USA
1,tt0000574,The Story of the Kelly Gang,1906,"Biography, Crime, Drama",Australia
2,tt0001892,Den sorte drøm,1911,Drama,"Germany, Denmark"
3,tt0002101,Cleopatra,1912,"Drama, History",USA
4,tt0002130,L'Inferno,1911,"Adventure, Drama, Fantasy",Italy
...,...,...,...,...,...
85850,tt9908390,Le lion,2020,Comedy,"France, Belgium"
85851,tt9911196,De Beentjes van Sint-Hildegard,2020,"Comedy, Drama",Netherlands
85852,tt9911774,Padmavyuhathile Abhimanyu,2019,Drama,India
85853,tt9914286,Sokagin Çocuklari,2019,"Drama, Family",Turkey


In [29]:
# Preview the trimmed ratings table
df_ratings

Unnamed: 0,imdb_title_id,total_votes,mean_vote
0,tt0000009,154,5.9
1,tt0000574,589,6.3
2,tt0001892,188,6.0
3,tt0002101,446,5.3
4,tt0002130,2237,6.9
...,...,...,...
85850,tt9908390,398,5.5
85851,tt9911196,724,7.9
85852,tt9911774,265,7.8
85853,tt9914286,194,9.4


# 1. concat()
## 1.1. Concatenate vertically
To concatenate vertically (along the rows), the two dataframes should have columns in common.

In [31]:
# First toy frame: four rows, each with an id and an age
df1 = pd.DataFrame(dict(id=list('ABCD'), age=[30, 23, 25, 22]))
df1

Unnamed: 0,id,age
0,A,30
1,B,23
2,C,25
3,D,22


In [3]:
# Second toy frame with the same 'id' and 'age' columns, so it can be stacked with df1
# NOTE(review): id 'F' appears twice - confirm whether the last entry was meant to be 'H'
df2 = pd.DataFrame(dict(id=['E', 'F', 'G', 'F'], age=[40, 21, 19, 24]))
df2

Unnamed: 0,id,age
0,E,40
1,F,21
2,G,19
3,F,24


In [41]:
# Stack df2 underneath df1; each frame keeps its own row labels in the result
pd.concat([df1, df2], axis=0)  # axis=0 is the default and concatenates vertically (row-wise)

Unnamed: 0,id,age
0,A,30
1,B,23
2,C,25
3,D,22
0,E,40
1,F,21
2,G,19
3,F,24


In [42]:
# Pass ignore_index=True to discard the original row labels and renumber the result 0, 1, 2, ...
pd.concat([df1, df2], ignore_index=True)

Unnamed: 0,id,age
0,A,30
1,B,23
2,C,25
3,D,22
4,E,40
5,F,21
6,G,19
7,F,24


# Exercise
Extract a random 50% sample of the original dataframe.

In [45]:
# Draw a random 50% sample of the movie rows.
# random_state pins the draw so that Restart & Run All reproduces the same rows;
# without it every run selects a different sample.
df_sample = df_movies.sample(frac=0.5, random_state=42)
df_sample

Unnamed: 0,imdb_title_id,title,year,genre,country
84886,tt8919396,Muklawa,2019,Drama,India
35215,tt0185027,L'invincibile Bedman,1972,"Action, Sci-Fi",Turkey
72752,tt3818552,El Vientre,2014,Thriller,Peru
15273,tt0067931,La vendetta è un piatto che si serve freddo,1971,Western,Italy
69776,tt3091138,Curveball,2015,"Drama, Sport",USA
...,...,...,...,...,...
56916,tt1320296,Shank,2010,"Action, Comedy, Crime",UK
81180,tt6774106,Pullikkaran Staraa,2017,Drama,India
50790,tt0850253,Battle in Seattle - Nessuno li può fermare,2007,"Action, Drama","Canada, USA, Germany"
75498,tt4699624,KillerSaurus,2015,"Action, Adventure, Comedy",UK


In [47]:
# Report (rows, columns) for each frame about to be concatenated
for frame in (df_movies, df_sample):
    print(frame.shape)

(85855, 5)
(42928, 5)


In [48]:
# Full movies table: the first input to the vertical concatenation
df_movies

Unnamed: 0,imdb_title_id,title,year,genre,country
0,tt0000009,Miss Jerry,1894,Romance,USA
1,tt0000574,The Story of the Kelly Gang,1906,"Biography, Crime, Drama",Australia
2,tt0001892,Den sorte drøm,1911,Drama,"Germany, Denmark"
3,tt0002101,Cleopatra,1912,"Drama, History",USA
4,tt0002130,L'Inferno,1911,"Adventure, Drama, Fantasy",Italy
...,...,...,...,...,...
85850,tt9908390,Le lion,2020,Comedy,"France, Belgium"
85851,tt9911196,De Beentjes van Sint-Hildegard,2020,"Comedy, Drama",Netherlands
85852,tt9911774,Padmavyuhathile Abhimanyu,2019,Drama,India
85853,tt9914286,Sokagin Çocuklari,2019,"Drama, Family",Turkey


In [49]:
# Sampled rows: the second input to the vertical concatenation
df_sample

Unnamed: 0,imdb_title_id,title,year,genre,country
84886,tt8919396,Muklawa,2019,Drama,India
35215,tt0185027,L'invincibile Bedman,1972,"Action, Sci-Fi",Turkey
72752,tt3818552,El Vientre,2014,Thriller,Peru
15273,tt0067931,La vendetta è un piatto che si serve freddo,1971,Western,Italy
69776,tt3091138,Curveball,2015,"Drama, Sport",USA
...,...,...,...,...,...
56916,tt1320296,Shank,2010,"Action, Comedy, Crime",UK
81180,tt6774106,Pullikkaran Staraa,2017,Drama,India
50790,tt0850253,Battle in Seattle - Nessuno li può fermare,2007,"Action, Drama","Canada, USA, Germany"
75498,tt4699624,KillerSaurus,2015,"Action, Adventure, Comedy",UK


In [50]:
# Append the sampled rows beneath the full movies table (row-wise concat),
# then peek at the first few rows of the result
frames_to_stack = [df_movies, df_sample]
df_concat_vertically = pd.concat(frames_to_stack, axis=0)
df_concat_vertically.head()

Unnamed: 0,imdb_title_id,title,year,genre,country
0,tt0000009,Miss Jerry,1894,Romance,USA
1,tt0000574,The Story of the Kelly Gang,1906,"Biography, Crime, Drama",Australia
2,tt0001892,Den sorte drøm,1911,Drama,"Germany, Denmark"
3,tt0002101,Cleopatra,1912,"Drama, History",USA
4,tt0002130,L'Inferno,1911,"Adventure, Drama, Fantasy",Italy


In [52]:
# Sanity check that the concatenation worked: the row count should equal the sum
# of the two inputs' row counts, and the column count should be unchanged
df_concat_vertically.shape

(128783, 5)

## 1.2. Concatenate horizontally
To concatenate horizontally (along the columns), both dataframes should have the same `number of rows` and the `same index`.

In [53]:
# Two toy frames with the same default row labels (0-3) but different columns
df1 = pd.DataFrame(dict(id=list('ABCD'), age=[30, 23, 25, 22]))
job_titles = ['Doctor', 'Statistician', 'Accountant', 'Developer']
df2 = pd.DataFrame({'job': job_titles})

In [54]:
# Left-hand frame: id and age
df1

Unnamed: 0,id,age
0,A,30
1,B,23
2,C,25
3,D,22


In [55]:
# Right-hand frame: job
df2

Unnamed: 0,job
0,Doctor
1,Statistician
2,Accountant
3,Developer


In [56]:
# Place df2's columns to the right of df1's, matching rows by index label
pd.concat([df1, df2], axis=1)  # axis=1 concatenates horizontally (column-wise)

Unnamed: 0,id,age,job
0,A,30,Doctor
1,B,23,Statistician
2,C,25,Accountant
3,D,22,Developer


# Exercise 

In [58]:
# Report (rows, columns) for the two frames that will be joined side by side
for frame in (df_movies, df_ratings):
    print(frame.shape)

(85855, 5)
(85855, 3)


In [62]:
# Use the IMDb title id as the row label of both tables so they can be aligned on it.
# Reassigning (instead of inplace=True) together with the guard keeps this cell safe
# to re-run: with inplace=True a second run raised KeyError, because the
# 'imdb_title_id' column had already been moved into the index.
if df_movies.index.name != 'imdb_title_id':
    df_movies = df_movies.set_index('imdb_title_id')
if df_ratings.index.name != 'imdb_title_id':
    df_ratings = df_ratings.set_index('imdb_title_id')

In [65]:
# Movies table, now labelled by imdb_title_id
df_movies

Unnamed: 0_level_0,title,year,genre,country
imdb_title_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
tt0000009,Miss Jerry,1894,Romance,USA
tt0000574,The Story of the Kelly Gang,1906,"Biography, Crime, Drama",Australia
tt0001892,Den sorte drøm,1911,Drama,"Germany, Denmark"
tt0002101,Cleopatra,1912,"Drama, History",USA
tt0002130,L'Inferno,1911,"Adventure, Drama, Fantasy",Italy
...,...,...,...,...
tt9908390,Le lion,2020,Comedy,"France, Belgium"
tt9911196,De Beentjes van Sint-Hildegard,2020,"Comedy, Drama",Netherlands
tt9911774,Padmavyuhathile Abhimanyu,2019,Drama,India
tt9914286,Sokagin Çocuklari,2019,"Drama, Family",Turkey


In [66]:
# Ratings table, now labelled by imdb_title_id
df_ratings

Unnamed: 0_level_0,total_votes,mean_vote
imdb_title_id,Unnamed: 1_level_1,Unnamed: 2_level_1
tt0000009,154,5.9
tt0000574,589,6.3
tt0001892,188,6.0
tt0002101,446,5.3
tt0002130,2237,6.9
...,...,...
tt9908390,398,5.5
tt9911196,724,7.9
tt9911774,265,7.8
tt9914286,194,9.4


Both of the above dataframes have the same index, `imdb_title_id`.
Now concatenate them horizontally, along the columns.

In [68]:
# Join df_movies and df_ratings side by side (column-wise); both frames are
# indexed by 'imdb_title_id', which is what pd.concat lines the rows up on
df_concat_horizontally = pd.concat([df_movies, df_ratings], axis=1)

In [69]:
# Display the horizontally concatenated dataframe.
# A bare `.shape` expression used to precede this line, but a cell renders only
# its last expression, so it never showed anything; the shape is checked in the next cell.
df_concat_horizontally

Unnamed: 0_level_0,title,year,genre,country,total_votes,mean_vote
imdb_title_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
tt0000009,Miss Jerry,1894,Romance,USA,154,5.9
tt0000574,The Story of the Kelly Gang,1906,"Biography, Crime, Drama",Australia,589,6.3
tt0001892,Den sorte drøm,1911,Drama,"Germany, Denmark",188,6.0
tt0002101,Cleopatra,1912,"Drama, History",USA,446,5.3
tt0002130,L'Inferno,1911,"Adventure, Drama, Fantasy",Italy,2237,6.9
...,...,...,...,...,...,...
tt9908390,Le lion,2020,Comedy,"France, Belgium",398,5.5
tt9911196,De Beentjes van Sint-Hildegard,2020,"Comedy, Drama",Netherlands,724,7.9
tt9911774,Padmavyuhathile Abhimanyu,2019,Drama,India,265,7.8
tt9914286,Sokagin Çocuklari,2019,"Drama, Family",Turkey,194,9.4


In [70]:
# Expect the same number of rows as each input and 4 + 2 = 6 columns
df_concat_horizontally.shape

(85855, 6)