In [2]:
import csv

# movie analysis using plain python

def calculate_rating_stars(data, industry=None):
  """Return the (max, min, average) rating over the rows of ``data``.

  Args:
    data: Iterable of indexable rows. ``row[1]`` is compared against
      ``industry`` and ``row[3]`` holds the rating (text from the CSV, or
      anything ``float()`` accepts).
    industry: When truthy, only rows whose ``row[1]`` equals this value are
      used; when ``None`` (or any falsy value) every row is used.

  Returns:
    Tuple ``(max_rating, min_rating, avg_rating)`` of floats.

  Raises:
    ValueError: If no row contributes a rating, or a rating is text that
      ``float()`` cannot parse.
  """
  ratings = []
  for row in data:
    raw_rating = row[3]
    # Missing ratings show up as the literal 'NULL' or as an empty cell;
    # skip both (pandas ignores missing values the same way) instead of
    # letting float('') blow up.
    if raw_rating is None:
      continue
    if isinstance(raw_rating, str) and raw_rating.strip() in ('', 'NULL'):
      continue
    if not industry or row[1] == industry:
      ratings.append(float(raw_rating))

  # max()/min() on an empty list raise a message that does not say what went
  # wrong; fail with the same exception type but a useful explanation.
  if not ratings:
    raise ValueError(f"no ratings found for industry={industry!r}")

  return max(ratings), min(ratings), sum(ratings) / len(ratings)

# Read the whole CSV up front so the file is closed before any analysis runs.
# newline="" is what the csv module asks for, so that line breaks inside
# quoted fields are parsed correctly; the explicit encoding keeps decoding
# independent of the platform's locale default.
with open("movies.csv", newline="", encoding="utf-8") as f:
  rows = list(csv.reader(f))

header = rows[0] # headers
data = rows[1:] # actual data

# One summary line per subset; an industry of None means "no filter".
for label, industry in (
  ("All records", None),
  ("Bollywood", "Bollywood"),
  ("Hollywood", "Hollywood"),
):
  max_rating, min_rating, avg_rating = calculate_rating_stars(data, industry=industry)
  print(f"{label}: Min rating = {min_rating}, Max rating = {max_rating}, Avg rating = {avg_rating}")

All records: Min rating = 1.9, Max rating = 9.3, Avg rating = 7.919444444444445
Bollywood: Min rating = 1.9, Max rating = 8.4, Avg rating = 7.656250000000001
Hollywood: Min rating = 6.8, Max rating = 9.3, Avg rating = 8.130000000000003


In [7]:
import pandas as pd

# movie analysis using pandas

# load the CSV into a DataFrame (the first line of the file becomes the column names)
data_frame = pd.read_csv("movies.csv")

# head(n) shows the first n rows, counted from the top
data_frame.head(n=5)

Unnamed: 0,title,industry,release_year,imdb_rating,studio,budget,revenue,unit,currency,language
0,Pather Panchali,Bollywood,1955,8.3,Government of West Bengal,70000.0,100000.0,Thousands,INR,Bengali
1,Doctor Strange in the Multiverse of Madness,Hollywood,2022,7.0,Marvel Studios,200.0,954.8,Millions,USD,English
2,Thor: The Dark World,Hollywood,2013,6.8,Marvel Studios,165.0,644.8,Millions,USD,English
3,Thor: Ragnarok,Hollywood,2017,7.9,Marvel Studios,180.0,854.0,Millions,USD,English
4,Thor: Love and Thunder,Hollywood,2022,6.8,Marvel Studios,250.0,670.0,Millions,USD,English


In [8]:
# tail(n) shows the last n rows, counted from the bottom
data_frame.tail(n=5)

Unnamed: 0,title,industry,release_year,imdb_rating,studio,budget,revenue,unit,currency,language
32,Shershaah,Bollywood,2021,8.4,Dharma Productions,500.0,950.0,Millions,INR,Hindi
33,K.G.F: Chapter 2,Bollywood,2022,8.4,Hombale Films,1.0,12.5,Billions,INR,Kannada
34,Pushpa: The Rise - Part 1,Bollywood,2021,7.6,Mythri Movie Makers,2.0,3.6,Billions,INR,Telugu
35,RRR,Bollywood,2022,8.0,DVV Entertainment,5.5,12.0,Billions,INR,Telugu
36,Baahubali: The Beginning,Bollywood,2015,8.0,Arka Media Works,1.8,6.5,Billions,INR,Telugu


In [9]:
# sample(n) shows n rows picked at random, so the output changes on every run
data_frame.sample(n=5)

Unnamed: 0,title,industry,release_year,imdb_rating,studio,budget,revenue,unit,currency,language
8,Gladiator,Hollywood,2000,8.5,Universal Pictures,103.0,460.5,Millions,USD,English
34,Pushpa: The Rise - Part 1,Bollywood,2021,7.6,Mythri Movie Makers,2.0,3.6,Billions,INR,Telugu
22,3 Idiots,Bollywood,2009,8.4,Vinod Chopra Films,550.0,4000.0,Millions,INR,Hindi
27,PK,Bollywood,2014,8.1,Vinod Chopra Films,850.0,8540.0,Millions,INR,Hindi
9,Titanic,Hollywood,1997,7.9,Paramount Pictures,200.0,2202.0,Millions,USD,English


In [10]:
# slice rows by position: [2:6] returns rows 2 through 5 (the end, 6, is excluded)
data_frame.iloc[2:6]

Unnamed: 0,title,industry,release_year,imdb_rating,studio,budget,revenue,unit,currency,language
2,Thor: The Dark World,Hollywood,2013,6.8,Marvel Studios,165.0,644.8,Millions,USD,English
3,Thor: Ragnarok,Hollywood,2017,7.9,Marvel Studios,180.0,854.0,Millions,USD,English
4,Thor: Love and Thunder,Hollywood,2022,6.8,Marvel Studios,250.0,670.0,Millions,USD,English
5,The Shawshank Redemption,Hollywood,1994,9.3,Castle Rock Entertainment,25.0,73.3,Millions,USD,English


In [11]:
# shape is a (rows, columns) tuple; for this dataset it is (37, 10)
data_frame.shape

(37, 10)

In [14]:
# pick a single column, then take rows 2 through 5 of it by position
data_frame["imdb_rating"].iloc[2:6]
# a column can also be read as an attribute:
# data_frame.imdb_rating

2    6.8
3    7.9
4    6.8
5    9.3
Name: imdb_rating, dtype: float64

In [15]:
# list every attribute and method available on the column (a pandas Series)
dir(data_frame["imdb_rating"])

['T',
 '_AXIS_LEN',
 '_AXIS_ORDERS',
 '_AXIS_TO_AXIS_NUMBER',
 '_HANDLED_TYPES',
 '__abs__',
 '__add__',
 '__and__',
 '__annotations__',
 '__array__',
 '__array_priority__',
 '__array_ufunc__',
 '__bool__',
 '__class__',
 '__column_consortium_standard__',
 '__contains__',
 '__copy__',
 '__deepcopy__',
 '__delattr__',
 '__delitem__',
 '__dict__',
 '__dir__',
 '__divmod__',
 '__doc__',
 '__eq__',
 '__finalize__',
 '__float__',
 '__floordiv__',
 '__format__',
 '__ge__',
 '__getattr__',
 '__getattribute__',
 '__getitem__',
 '__getstate__',
 '__gt__',
 '__hash__',
 '__iadd__',
 '__iand__',
 '__ifloordiv__',
 '__imod__',
 '__imul__',
 '__init__',
 '__init_subclass__',
 '__int__',
 '__invert__',
 '__ior__',
 '__ipow__',
 '__isub__',
 '__iter__',
 '__itruediv__',
 '__ixor__',
 '__le__',
 '__len__',
 '__lt__',
 '__matmul__',
 '__mod__',
 '__module__',
 '__mul__',
 '__ne__',
 '__neg__',
 '__new__',
 '__nonzero__',
 '__or__',
 '__pandas_priority__',
 '__pos__',
 '__pow__',
 '__radd__',
 '__rand__

In [16]:
# pandas gives the same min / max / mean as the plain-Python code above, with no explicit loop
imdb_ratings = data_frame["imdb_rating"]
imdb_ratings.min(), imdb_ratings.max(), imdb_ratings.mean()

(np.float64(1.9), np.float64(9.3), np.float64(7.919444444444445))

In [18]:
# keep only the rows whose industry column equals "Bollywood" (boolean-mask filter)
is_bollywood = data_frame["industry"] == "Bollywood"
bollywood_df = data_frame.loc[is_bollywood]


In [22]:
# min / max / mean rating computed on the filtered Bollywood rows only
bollywood_ratings = bollywood_df["imdb_rating"]
bollywood_ratings.min(), bollywood_ratings.max(), bollywood_ratings.mean()

(np.float64(1.9), np.float64(8.4), np.float64(7.656250000000001))

In [24]:
# same filter-then-aggregate steps, this time for the Hollywood rows
is_hollywood = data_frame["industry"] == "Hollywood"
hollywood_df = data_frame.loc[is_hollywood]
hollywood_ratings = hollywood_df["imdb_rating"]
hollywood_ratings.min(), hollywood_ratings.max(), hollywood_ratings.mean()

(np.float64(6.8), np.float64(9.3), np.float64(8.130000000000003))

In [26]:
# the whole plain-Python analysis from the first cell, rewritten with pandas
df = pd.read_csv("movies.csv")
all_ratings = df["imdb_rating"]
print("All movies: Min=", all_ratings.min(), "Max=", all_ratings.max(), "Avg=", all_ratings.mean())

# one filtered frame per industry
df_b = df.loc[df["industry"] == "Bollywood"]
df_h = df.loc[df["industry"] == "Hollywood"]

ratings_b = df_b["imdb_rating"]
ratings_h = df_h["imdb_rating"]
print("Bollywood: Min=", ratings_b.min(), "Max=", ratings_b.max(), "Avg=", ratings_b.mean())
print("Hollywood: Min=", ratings_h.min(), "Max=", ratings_h.max(), "Avg=", ratings_h.mean())

All movies: Min= 1.9 Max= 9.3 Avg= 7.919444444444445
Bollywood: Min= 1.9 Max= 8.4 Avg= 7.656250000000001
Hollywood: Min= 6.8 Max= 9.3 Avg= 8.130000000000003
