In [None]:
import pandas as pd
import numpy as np 
import seaborn as sns 
import plotly.express as px 
import matplotlib.pyplot as plt 
%matplotlib inline

In [None]:
# Raw string: the Windows backslashes (\e, \d, \h) are otherwise parsed as
# invalid escape sequences, which warn today and are slated to become errors.
# NOTE(review): this is a machine-specific absolute path; consider a configurable data directory.
df = pd.read_csv(r'D:\epsilon course\datasets\horror_movies.csv')
df.head()

In [None]:
df.info()

In [None]:
# Summary statistics of the numeric columns, rounded to whole numbers for readability.
df.describe().round()

In [None]:
# Normalise the headers in one pass: lowercase, trim surrounding whitespace,
# then turn inner spaces into underscores.
df.columns = (
    df.columns
    .str.lower()
    .str.strip()
    .str.replace(' ', '_')
)
df

## check missing values

In [None]:
df.isna().sum()

## check duplicated values

In [None]:
df.duplicated().sum()

## convert release date to datetime format

In [None]:
df['release_date'] = pd.to_datetime(df['release_date'])

In [None]:
df.info()

## univariate analysis

In [None]:
numeric_columns = df.select_dtypes(include = np.number).columns
numeric_columns

In [None]:
# Draw one box plot per numeric column, each in its own figure.
for column_name in numeric_columns:
    sns.boxplot(x=column_name, data=df)
    plt.show()

In [None]:
df_num = df.select_dtypes(include='number')
plt.figure(figsize =[14,7])
sns.heatmap(df_num.corr(),annot=True)


## bivariate analysis

In [None]:
sns.scatterplot(data = df, x='release_date',y='revenue',hue='vote_count')

## creating new features

In [None]:
df['month'] = df['release_date'].dt.month
df['day'] = df['release_date'].dt.day
df['year'] = df['release_date'].dt.year

## Analysis Questions

## what is the average popularity rating of horror movies in the dataset?

In [None]:
df['popularity'].mean()

## Which horror movie has the highest revenue?



In [None]:
# The question asks *which* movie, so look up the row holding the maximum
# revenue and show its title next to the amount (max() alone gives only the number).
df.loc[df['revenue'].idxmax(), ['title', 'revenue']]

## What is the distribution of runtime for horror movies?

In [None]:
df['runtime'].describe()

## How many horror movies in the dataset are part of a collection?

In [None]:
df['collection'].count()

## Which horror movies have the highest vote average?

In [None]:
# Several movies can share the top score, so return every row that matches
# the maximum instead of only the maximum value itself.
top_vote_average = df['vote_average'].max()
df.loc[df['vote_average'] == top_vote_average, ['title', 'vote_average']]

## What is the most common original language among horror movies?

In [None]:
df['original_language'].mode()[0]

## Is there a correlation between movie popularity and vote count?

In [None]:
df['popularity'].corr(df['vote_count'])

## what is the minimum revenue?

In [None]:
df['revenue'].min()

## Which 10 movies have the highest revenue?

In [None]:
df.groupby('title')['revenue'].max().sort_values(ascending=False).head(10)

## How many movies are released in each year?

In [None]:
df['year'].value_counts().sort_index()

## How many movies are part of a collection?

In [None]:
df['collection_name'].count()

## what is the total revenue of the movies?

In [None]:
df['revenue'].sum()

## how many movies are classified as horror?

In [None]:
# na=False treats rows with a missing genre as "not horror"; without it a NaN
# in genre_names would make the boolean mask invalid. Summing the mask counts
# the matches without materialising a filtered copy of the frame.
df['genre_names'].str.contains('horror', case=False, na=False).sum()

## What are the taglines of the 10 most popular movies?

In [None]:
# Rank the movies themselves by popularity, then show their taglines.
# Grouping by tagline would silently drop movies with no tagline and merge
# movies that share one, so the result would not be the 10 most popular movies.
df.nlargest(10, 'popularity')[['title', 'tagline', 'popularity']]

## How many movies were released in each month?

In [None]:
df['month'].value_counts().sort_index()

## How many movies have a budget greater than $50 million?

In [None]:
# Threshold matches the question ($50 million); the previous cut-off was 100,000.
(df['budget'] > 50_000_000).sum()

## What is the earliest release date among the movies?

In [None]:
# Use the full datetime column so the answer is an actual date, not just a year.
df['release_date'].min()

## How many movies have a runtime greater than 100 minutes?

In [None]:
# Sum the boolean mask to count the True rows; len() of the mask is just the
# total number of rows. Strict '>' matches "greater than 100 minutes".
(df['runtime'] > 100).sum()

## How many movies are part of the Orphan Collection?

In [None]:
orphan_collection = df[df['collection_name'] == 'Orphan Collection']
len(orphan_collection)

## How many movies were released in each day of the week?

In [None]:
# df['day'] holds the day of the month (1-31); the question is about weekdays,
# so derive the weekday name from the release date and list Monday..Sunday.
weekday_order = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']
df['release_date'].dt.day_name().value_counts().reindex(weekday_order, fill_value=0)

In [None]:
df.head()

## dealing with missing values

In [None]:
df.isna().sum()

In [None]:
# Assign the result back: fillna(inplace=True) on a column selection is a chained
# assignment, which newer pandas deprecates and which no longer updates df under Copy-on-Write.
df['collection_name'] = df['collection_name'].fillna('')

In [None]:
# Assign the result back: fillna(inplace=True) on a column selection is a chained
# assignment, which newer pandas deprecates and which no longer updates df under Copy-on-Write.
df['overview'] = df['overview'].fillna('Empty')

In [None]:
# Assign the result back: fillna(inplace=True) on a column selection is a chained
# assignment, which newer pandas deprecates and which no longer updates df under Copy-on-Write.
df['tagline'] = df['tagline'].fillna('Empty')

In [None]:
df['collection'].describe()

In [None]:
# Assign the result back instead of the deprecated chained fillna(inplace=True).
# NOTE(review): 'collection' looks like an identifier column; filling gaps with its median
# erases the "not part of a collection" signal. Confirm this imputation is intended.
df['collection'] = df['collection'].fillna(df['collection'].median())

In [None]:
 # index=False keeps the row index out of the file; otherwise reloading the CSV
 # adds a spurious "Unnamed: 0" column.
 df.to_csv('horror.csv', index=False)

In [None]:
import streamlit as st

In [None]:
%%writefile horror_movies1.py
# Streamlit dashboard for the cleaned horror-movies dataset.
# Three tabs: descriptive statistics, numerical charts, categorical charts.
# All filters live in the sidebar.
import streamlit as st
import pandas as pd
import plotly.express as px
import matplotlib.pyplot as plt
import seaborn as sns

st.set_page_config(layout='wide',
                  page_title = 'dashboard')

tab1, tab2, tab3 = st.tabs(['descriptive statistics', 'numerical charts', 'categorical chart'])

# NOTE(review): the notebook exports 'horror.csv' to its working directory, while this
# script reads from an absolute path. Confirm both point at the same file.
df = pd.read_csv(r"D:\epsilon course\datasets\horror.csv")
box = st.sidebar.checkbox('show data', False ,key =1)
num = df.describe()
cat = df.describe(include="O")


# Optional preview of the first rows, toggled from the sidebar.
if box:
    st.header('sample data')
    st.dataframe(df.head(10))

with tab1:
    col1,col2,col3 = st.columns(3)
    with col1:
        st.subheader('categorical descriptive statistics')
        st.dataframe(cat)

    with col3:
        st.subheader('numerical descriptive statistics')
        st.dataframe(num)

with tab2:
    year = st.sidebar.selectbox('select year',df['year'].unique())
    vote = st.sidebar.selectbox('select vote',df['vote_average'].unique())
    col1,col2,col3 = st.columns(3)
    with col1:
        new_df = df[df['year'] == year]
        # Plotly Express label keys must be column names; 'x'/'y' keys are ignored.
        fig =  px.line(df, x='year', y='revenue', labels={'year': 'year', 'revenue': 'Revenue'})
        st.plotly_chart(fig,use_container_width=True)
        fig = px.bar(new_df, x ='revenue', y = 'year', color = 'vote_average',title = f'revenue for {year} year'.title())
        st.plotly_chart(fig,use_container_width=True)
        new_df1 = df[df['vote_average'] == vote]
        fig = px.scatter(new_df1, x= 'runtime', y='budget', color = 'revenue', title = f'correlation between runtime and budget {vote}'.title())
        st.plotly_chart(fig,use_container_width=True)

with tab3:
    # Options come from 'title' because the filter below matches on df['title'];
    # listing 'original_title' gave empty charts whenever the two columns differed.
    movie_name = st.sidebar.selectbox('select movie name',df['title'].unique())
    genre_name = st.sidebar.selectbox('select genre name',df['genre_names'].unique())
    col1,col2,col3 = st.columns(3)
    with col1:
        df_new2 = df[df['title'] == movie_name]
        fig = px.scatter(df_new2, x='popularity', y='vote_average', hover_data=['title'])
        st.plotly_chart(fig,use_container_width=True)
    with col3:
        # Filter next to the chart that consumes it.
        df_new3 = df[df['genre_names'] == genre_name]
        fig = px.bar(df_new3, x='genre_names', y='popularity')
        st.plotly_chart(fig,use_container_width = True)


In [None]:
! streamlit run horror_movies1.py

## Handle outliers

In [None]:
from datasist.structdata import detect_outliers
# Collect the row labels that detect_outliers reports for the five listed numeric
# columns; a later cell passes them to df.drop.
# NOTE(review): the second argument (0) is presumably the minimum number of outlying
# features a row needs before it is reported. Confirm against the datasist docs.
index = detect_outliers(df,0,['popularity','vote_count','runtime','revenue','budget'])
# Number of rows that will be dropped.
len(index)

In [None]:
df.shape

In [None]:
df.drop(index,inplace=True)

## working on recommendation system 

In [None]:
from sklearn.metrics.pairwise import cosine_similarity

In [None]:
np.unique(df['original_language'])

## choose only some portion of data

In [None]:
df_2 = df.loc[(df['original_language'] == 'en') & (df['year']>2000)]
len(df_2)

In [None]:
df_3 = pd.DataFrame(df_2)
df_3.tail()

In [None]:
# Keep only the text used for recommendations, keyed by movie title.
final_data = df_3[['title', 'tagline']].set_index('title')

In [None]:
final_data.head(50)

## preprocess the data

In [None]:
import nltk
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')
nltk.download('stopwords')
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
lemmatizer = WordNetLemmatizer()

In [None]:
# Load the stop-word list once into a set. The original re-read it for every
# token, which is loop-invariant work plus a linear list scan per lookup.
ENGLISH_STOP_WORDS = frozenset(stopwords.words('english'))

# Contraction fragments and their expansions, applied in this order.
CONTRACTION_EXPANSIONS = (
    ("n't", "not"),
    ("'m", "am"),
    ("'s", "is"),
    ("'re", "are"),
    ("'ll", "will"),
    ("'d", "would"),
    ("'ve", "have"),
)


def preprocess_sentences(text):
    """Clean a tagline for TF-IDF vectorisation.

    The text is lower-cased, tokenized with NLTK, stripped of English stop
    words, lemmatized, re-joined with single spaces, and finally has
    contraction fragments (e.g. "n't") expanded.

    Parameters
    ----------
    text : str
        Raw tagline. Non-string values such as NaN are treated as empty.

    Returns
    -------
    str
        The cleaned, space-separated text ('' for non-string input).
    """
    # Missing taglines arrive as float NaN, which has no .lower().
    if not isinstance(text, str):
        return ''
    words = nltk.word_tokenize(text.lower())
    my_sent = [lemmatizer.lemmatize(word) for word in words if word not in ENGLISH_STOP_WORDS]
    final_sent = ' '.join(my_sent)
    # Expansion runs after stop-word removal, so the expanded words
    # ("not", "is", ...) are kept in the output.
    for fragment, expansion in CONTRACTION_EXPANSIONS:
        final_sent = final_sent.replace(fragment, expansion)
    return final_sent
final_data['new_tagline'] = final_data['tagline'].apply(preprocess_sentences)
final_data.head()


## using TF-IDF to vectorize preprocessed movie tagline

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Vectorise the cleaned taglines: one TF-IDF row per movie.
tfidf = TfidfVectorizer()
tfidf_movie_id = tfidf.fit_transform(final_data["new_tagline"])

# Pairwise cosine similarity of every tagline against every other; with a
# single argument the matrix is compared with itself.
from sklearn.metrics.pairwise import cosine_similarity
similarty = cosine_similarity(tfidf_movie_id)

In [None]:
# Positional lookup: row i of the similarity matrix corresponds to indices[i].
indices = pd.Series(final_data.index)

def recommendation(title, cosine_sim = similarty):
    """Return up to 10 titles whose taglines are most similar to `title`'s.

    Parameters
    ----------
    title : str
        Movie title to look up; must match an entry of `final_data.index` exactly.
        If the title occurs more than once, the first occurrence is used.
    cosine_sim : array-like
        Square similarity matrix whose rows/columns follow the order of `indices`.

    Returns
    -------
    list or None
        Titles ordered from most to least similar, or None (after printing a
        message) when the title is not in the dataset.
    """
    matches = indices[indices == title]
    # Explicit not-found check replaces the bare `except`, which also hid real errors.
    if matches.empty:
        print("No movie name found")
        return None
    index = matches.index[0]
    # Drop the queried movie by position before ranking. Slicing [1:11] assumed it
    # always sorts first, which fails when other movies tie at the same score.
    similarity_scores = (
        pd.Series(cosine_sim[index])
        .drop(index)
        .sort_values(ascending=False, kind='stable')
    )
    top_10_movies = similarity_scores.head(10).index
    return indices.iloc[top_10_movies].tolist()

In [None]:
recommendation('The Revenge of Robert')