In [7]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt

import plotly
import plotly.express as px
import plotly.graph_objs as go
import plotly.offline as py
from plotly.offline import iplot
from plotly.subplots import make_subplots
import plotly.figure_factory as ff

In [8]:
df = pd.read_csv("/content/sample_data/NetflixOriginals.csv", encoding="latin-1")
df.head()

Unnamed: 0,Title,Genre,Premiere,Runtime,IMDB Score,Language
0,Enter the Anime,Documentary,"August 5, 2019",58,2.5,English/Japanese
1,Dark Forces,Thriller,"August 21, 2020",81,2.6,Spanish
2,The App,Science fiction/Drama,"December 26, 2019",79,2.6,Italian
3,The Open House,Horror thriller,"January 19, 2018",94,3.2,English
4,Kaali Khuhi,Mystery,"October 30, 2020",90,3.4,Hindi


In [9]:
#shape of the data
df.shape

(584, 6)

In [10]:
#Column Counts
df.columns.value_counts()

Title         1
Genre         1
Premiere      1
Runtime       1
IMDB Score    1
Language      1
dtype: int64

In [11]:
#Check missing values
df.isnull().sum()

Title         0
Genre         0
Premiere      0
Runtime       0
IMDB Score    0
Language      0
dtype: int64

In [12]:
#data info
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 584 entries, 0 to 583
Data columns (total 6 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   Title       584 non-null    object 
 1   Genre       584 non-null    object 
 2   Premiere    584 non-null    object 
 3   Runtime     584 non-null    int64  
 4   IMDB Score  584 non-null    float64
 5   Language    584 non-null    object 
dtypes: float64(1), int64(1), object(4)
memory usage: 27.5+ KB


In [13]:
df["Date"] = pd.to_datetime(df.Premiere)
df["Date"]

0     2019-08-05
1     2020-08-21
2     2019-12-26
3     2018-01-19
4     2020-10-30
         ...    
579   2018-12-31
580   2015-10-09
581   2018-12-16
582   2020-12-08
583   2020-10-04
Name: Date, Length: 584, dtype: datetime64[ns]

In [14]:
df["Year"] = df["Date"].dt.year
df["Month"] = df["Date"].dt.month
df["day_of_week"] = df["Date"].dt.dayofweek
df["Year_Month"] = df["Date"].dt.strftime("%Y-%m")

In [15]:
df.head()

Unnamed: 0,Title,Genre,Premiere,Runtime,IMDB Score,Language,Date,Year,Month,day_of_week,Year_Month
0,Enter the Anime,Documentary,"August 5, 2019",58,2.5,English/Japanese,2019-08-05,2019,8,0,2019-08
1,Dark Forces,Thriller,"August 21, 2020",81,2.6,Spanish,2020-08-21,2020,8,4,2020-08
2,The App,Science fiction/Drama,"December 26, 2019",79,2.6,Italian,2019-12-26,2019,12,3,2019-12
3,The Open House,Horror thriller,"January 19, 2018",94,3.2,English,2018-01-19,2018,1,4,2018-01
4,Kaali Khuhi,Mystery,"October 30, 2020",90,3.4,Hindi,2020-10-30,2020,10,4,2020-10


In [16]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 584 entries, 0 to 583
Data columns (total 11 columns):
 #   Column       Non-Null Count  Dtype         
---  ------       --------------  -----         
 0   Title        584 non-null    object        
 1   Genre        584 non-null    object        
 2   Premiere     584 non-null    object        
 3   Runtime      584 non-null    int64         
 4   IMDB Score   584 non-null    float64       
 5   Language     584 non-null    object        
 6   Date         584 non-null    datetime64[ns]
 7   Year         584 non-null    int64         
 8   Month        584 non-null    int64         
 9   day_of_week  584 non-null    int64         
 10  Year_Month   584 non-null    object        
dtypes: datetime64[ns](1), float64(1), int64(4), object(5)
memory usage: 50.3+ KB



  *  'Genre' Sütunu kaç kategoriye sahiptir ve bu kategoriler nelerdir? Görselleştirerek ifade ediniz.

In [17]:
df.Genre.nunique()

115

In [18]:
df.Genre.unique()

array(['Documentary', 'Thriller', 'Science fiction/Drama',
       'Horror thriller', 'Mystery', 'Action', 'Comedy',
       'Heist film/Thriller', 'Musical/Western/Fantasy', 'Drama',
       'Romantic comedy', 'Action comedy', 'Horror anthology',
       'Political thriller', 'Superhero-Comedy', 'Horror',
       'Romance drama', 'Anime / Short', 'Superhero', 'Heist', 'Western',
       'Animation/Superhero', 'Family film', 'Action-thriller',
       'Teen comedy-drama', 'Romantic drama', 'Animation',
       'Aftershow / Interview', 'Christmas musical',
       'Science fiction adventure', 'Science fiction', 'Variety show',
       'Comedy-drama', 'Comedy/Fantasy/Family', 'Supernatural drama',
       'Action/Comedy', 'Action/Science fiction',
       'Romantic teenage drama', 'Comedy / Musical', 'Musical',
       'Science fiction/Mystery', 'Crime drama',
       'Psychological thriller drama', 'Adventure/Comedy', 'Black comedy',
       'Romance', 'Horror comedy', 'Christian musical',
       'Rom

*Netflix veri kümesinde farklı film türleri mevcuttur. Netflix'teki 115 tür filmin tümünün adı yukarıdaki çıktıdadır.*

In [19]:
df.Genre.value_counts(normalize=True)*100

Documentary                             27.226027
Drama                                   13.184932
Comedy                                   8.390411
Romantic comedy                          6.678082
Thriller                                 5.650685
                                          ...    
Romantic comedy-drama                    0.171233
Heist film/Thriller                      0.171233
Musical/Western/Fantasy                  0.171233
Horror anthology                         0.171233
Animation/Christmas/Comedy/Adventure     0.171233
Name: Genre, Length: 115, dtype: float64

*Verilerden elimizde %27,22 belgesel film, %13,18 Drama film, %8,39 Komedi film vs var. diğer türlerden daha fazla görüntüleme.*

In [20]:
genre = df.Genre.value_counts().nlargest(20)
genre

Documentary                 159
Drama                        77
Comedy                       49
Romantic comedy              39
Thriller                     33
Comedy-drama                 14
Crime drama                  11
Biopic                        9
Horror                        9
Action                        7
Romance                       6
Concert Film                  6
Aftershow / Interview         6
Animation                     5
Action comedy                 5
Romantic drama                5
Psychological thriller        4
Science fiction/Thriller      4
Variety show                  4
Science fiction               4
Name: Genre, dtype: int64

In [21]:
fig = px.bar(data_frame=genre, x=genre.index, y=genre.values, labels={"y":"Number of Movies from the Genre", "index":"Genres"})
fig.update_layout(xaxis={"categoryorder":"total descending"})

fig.show()

  Netflix filmlerinin Dil Analizi
  * İngilizce çekilen filmler içerisinde hangi tür en yüksek IMDB puanına sahiptir?
  * Veri setinde bulunan filmlerde en çok kullanılan 3 dili bulunuz.

In [22]:
df.Language.unique()

array(['English/Japanese', 'Spanish', 'Italian', 'English', 'Hindi',
       'Turkish', 'Korean', 'Indonesian', 'Malay', 'Dutch', 'French',
       'English/Spanish', 'Portuguese', 'Filipino', 'German', 'Polish',
       'Norwegian', 'Marathi', 'Thai', 'Swedish', 'Japanese',
       'Spanish/Basque', 'Spanish/Catalan', 'English/Swedish',
       'English/Taiwanese/Mandarin', 'Thia/English', 'English/Mandarin',
       'Georgian', 'Bengali', 'Khmer/English/French', 'English/Hindi',
       'Tamil', 'Spanish/English', 'English/Korean', 'English/Arabic',
       'English/Russian', 'English/Akan', 'English/Ukranian/Russian'],
      dtype=object)

In [23]:
df.Language.value_counts()

English                       401
Hindi                          33
Spanish                        31
French                         20
Italian                        14
Portuguese                     12
Indonesian                      9
Japanese                        6
Korean                          6
German                          5
Turkish                         5
English/Spanish                 5
Polish                          3
Dutch                           3
Marathi                         3
English/Hindi                   2
Thai                            2
English/Mandarin                2
English/Japanese                2
Filipino                        2
English/Russian                 1
Bengali                         1
English/Arabic                  1
English/Korean                  1
Spanish/English                 1
Tamil                           1
English/Akan                    1
Khmer/English/French            1
Swedish                         1
Georgian      

In [24]:
df.Language.nunique()

38

In [25]:
top_20_lang = df.Language.value_counts().nlargest(20)
fig = px.bar(data_frame=top_20_lang, x=top_20_lang.index, y=top_20_lang.values, labels={"y":"Count", "index":"Language"})
fig.update_layout(xaxis={"categoryorder":"total descending"})

fig.show()

**Runtime Analizi**
 * 'Runtime' değeri en yüksek olan ilk 10 film hangileridir? Görselleştiriniz.

In [26]:
df.Runtime.describe()

count    584.000000
mean      93.577055
std       27.761683
min        4.000000
25%       86.000000
50%       97.000000
75%      108.000000
max      209.000000
Name: Runtime, dtype: float64

*Ortalama değerin ortanca değerden küçük olduğunu görürsünüz. Ortalama, medyandan küçükse, dağılım negatif çarpıktır ve bunun tersi de geçerlidir.*

In [27]:
fig = px.histogram(data_frame=df, x="Runtime", title="Runtime of Programs")

fig.show()

*Yukarıdaki histogramla, Runtime dağılımının (negatif olarak) çarpık bırakıldığını doğruladık.*

In [28]:
fig = px.box(data_frame=df, x="Runtime", hover_data=df[["Title", "Genre"]])
fig.update_traces(overwrite=True)

fig.show()

*En yüksek Runtime'a sahip film*

In [29]:
df[df.Runtime == df.Runtime.max()]["Title"]

561    The Irishman
Name: Title, dtype: object

*Veri kümesinden en düşük Runtime'a sahip film*

In [30]:
df[df.Runtime == df.Runtime.min()]["Title"]

40    Sol Levante
Name: Title, dtype: object

**IMDB Puanına Göre Analiz**
  * IMDB puanı en yüksek olan ilk 10 film hangileridir?
  * IMDB puanı ile 'Runtime' arasında nasıl bir korelasyon vardır? İnceleyip görselleştiriniz.
  * IMDB Puanı en yüksek olan ilk 10 'Genre' hangileridir? Görselleştiriniz


In [31]:
df["IMDB Score"].describe()

count    584.000000
mean       6.271747
std        0.979256
min        2.500000
25%        5.700000
50%        6.350000
75%        7.000000
max        9.000000
Name: IMDB Score, dtype: float64

*En yüksek puanlı film 9. En düşük puan 2.5. Yine medyan, ortalamadan biraz daha büyüktür. Yani dağılım negatif çarpıktır.*

In [32]:
fig = px.histogram(data_frame=df, x=df["IMDB Score"], title="IMDB Scores of the Programs")

fig.show()

*Yukarıda şekilde görüldüğü gibi dağılım negatif(sola) çarpıktır.*

In [33]:
fig = px.box(data_frame=df, x=df["IMDB Score"], hover_data=df[["Title", "Genre"]])
fig.update_traces(overwrite=False)

fig.show()

*Veri kümesinden en yüksek puana sahip film.*

In [34]:
df[df["IMDB Score"] == df["IMDB Score"].max()][["Title", "Genre"]]

Unnamed: 0,Title,Genre
583,David Attenborough: A Life on Our Planet,Documentary


*Veri kümesinden en düşük derecelendirmeye sahip film.*

In [35]:
df[df["IMDB Score"] == df["IMDB Score"].min()][["Title", "Genre"]]

Unnamed: 0,Title,Genre
0,Enter the Anime,Documentary


**Runtime ve IMDB Derecelendirmesi Arasındaki İlişki**

In [36]:
df["Runtime"].corr(df["IMDB Score"])

-0.04089629142078858

In [37]:
df[["IMDB Score", "Runtime"]].corr()

Unnamed: 0,IMDB Score,Runtime
IMDB Score,1.0,-0.040896
Runtime,-0.040896,1.0


*Mükemmel. Şimdi bir dağılım grafiği kullanarak Runtime ve IMDB puanına bir göz atalım*

In [38]:
fig = px.scatter(data_frame=df, x="IMDB Score", y="Runtime")
fig.update_layout(autosize=False, width=800, height=600,)

fig.show()

**Yıl bazında analiz**
 * **Hangi yılda en fazla film yayımlanmıştır? Görselleştiriniz.**

In [39]:
year = df.Year.value_counts()
year

2020    183
2019    125
2018     99
2021     71
2017     66
2016     30
2015      9
2014      1
Name: Year, dtype: int64

In [40]:
fig = px.bar(data_frame=df, x=year.index, y=year.values, labels={"y":"Movies", "x":"Year"})
fig.update_layout(xaxis={'categoryorder':'total descending'})

fig.show()

*Görüldüğü gibi Netflix'te her yıl film sayısı artıyor. 2021 tamamlanmadı, bu nedenle grafik düştü.*

**Ay bazında analiz**

In [41]:
month = df.Month.value_counts(sort=False)
month

8     37
12    51
1     37
10    77
11    57
6     35
3     48
5     53
4     63
9     53
2     39
7     34
Name: Month, dtype: int64

In [42]:
months = ['Jan','Feb','Mar','Apr','May','Jun','Jul','Aug','Sep','Oct','Nov','Dec']

fig = px.bar(data_frame=df, x=months, y=month.values, labels={"y":"Movies", "x":"Month"})

fig.show()

*Ekim ve Nisan, Netflix'te en yüksek sayıda film yayınına sahip. Yani, bir Netflix aboneliği satın almayı planlıyorsanız, bu ikisi sizin aylarınızdır.*

  *  IMDB Puanı en yüksek olan ilk 10 'Genre' hangileridir? Görselleştiriniz.

In [43]:
df.groupby("Genre")["IMDB Score"].nlargest(10)

Genre            
Action        372    6.7
              277    6.3
              220    6.1
              153    5.7
              50     4.9
                    ... 
War-Comedy    219    6.0
Western       516    7.3
              237    6.1
              49     4.8
Zombie/Heist  194    5.9
Name: IMDB Score, Length: 272, dtype: float64

In [45]:
top_10_ratings_by_genre = df.groupby("Genre")["IMDB Score"].mean().nlargest(10)
top_10_ratings_by_genre

Genre
Animation/Christmas/Comedy/Adventure    8.200000
Musical / Short                         7.700000
Concert Film                            7.633333
Anthology/Dark comedy                   7.600000
Animation / Science Fiction             7.500000
Making-of                               7.450000
Action-adventure                        7.300000
Coming-of-age comedy-drama              7.200000
Drama-Comedy                            7.200000
Historical drama                        7.200000
Name: IMDB Score, dtype: float64

In [46]:
fig = px.bar(data_frame=top_10_ratings_by_genre, x=top_10_ratings_by_genre.index, y=top_10_ratings_by_genre.values, 
             labels={'y':'Average Rating Score', 'x':'Genre'})

fig.show()

 * **'Runtime' değeri en yüksek olan ilk 10 film hangileridir? Görselleştiriniz.**

In [47]:
top_10_ratings = df[["IMDB Score", "Title", "Genre", "Year", "Language"]].sort_values(["IMDB Score"], ascending=False)[:10]
top_10_ratings

Unnamed: 0,IMDB Score,Title,Genre,Year,Language
583,9.0,David Attenborough: A Life on Our Planet,Documentary,2020,English
582,8.6,Emicida: AmarElo - It's All For Yesterday,Documentary,2020,Portuguese
581,8.5,Springsteen on Broadway,One-man show,2018,English
580,8.4,Winter on Fire: Ukraine's Fight for Freedom,Documentary,2015,English/Ukranian/Russian
579,8.4,Taylor Swift: Reputation Stadium Tour,Concert Film,2018,English
578,8.4,Ben Platt: Live from Radio City Music Hall,Concert Film,2020,English
577,8.3,Dancing with the Birds,Documentary,2019,English
576,8.3,Cuba and the Cameraman,Documentary,2017,English
573,8.2,Klaus,Animation/Christmas/Comedy/Adventure,2019,English
571,8.2,13th,Documentary,2016,English


In [48]:
fig = px.scatter(top_10_ratings, y= 'Title', x='IMDB Score', hover_data = top_10_ratings[['Genre','Year','Language']], color='Genre', 
                 title = "Top 10 High Rated Programs")

fig.show()