<a href="https://colab.research.google.com/github/shakirabanu20/Eda-code/blob/main/Project_eda_code.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Import necessary libraries
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Dummy viewing records, laid out one row per user so each record reads
# left-to-right in the order given by COLUMN_NAMES.
COLUMN_NAMES = (
    'age', 'average_watch_time', 'rating', 'gender',
    'genre', 'location', 'user_preferences', 'release_year',
)
RECORDS = (
    (23, 2.5, 4.5, 'Male', 'Action', 'NY', 'High', 2020),
    (34, 3.0, 4.0, 'Female', 'Comedy', 'CA', 'Medium', 2021),
    (45, 1.5, 3.5, 'Female', 'Drama', 'TX', 'Low', 2019),
    (29, 2.8, 4.2, 'Male', 'Action', 'NY', 'Medium', 2020),
    (31, 2.2, 4.7, 'Male', 'Drama', 'TX', 'High', 2018),
    (40, 3.5, 3.8, 'Female', 'Comedy', 'CA', 'Low', 2022),
    (22, 2.1, 4.1, 'Other', 'Action', 'FL', 'Medium', 2021),
    (37, 3.3, 4.0, 'Male', 'Horror', 'NY', 'High', 2019),
    (50, 1.8, 3.2, 'Female', 'Comedy', 'CA', 'Low', 2020),
    (28, 2.6, 4.3, 'Other', 'Drama', 'TX', 'Medium', 2022),
)

# Transpose the records into the column-oriented mapping: name -> list of values.
data = {
    name: list(values)
    for name, values in zip(COLUMN_NAMES, zip(*RECORDS))
}

# Persist the dummy frame to disk (without the index column) ...
df_dummy = pd.DataFrame(data)
df_dummy.to_csv('movies_data.csv', index=False)

# ... and read it back, so the rest of the analysis works from the CSV file.
df = pd.read_csv('movies_data.csv')

# Basic overview: first rows, column dtypes / non-null counts, summary stats
print(df.head())
# DataFrame.info() writes its report to stdout itself and returns None, so
# wrapping it in print() would emit a stray "None" line after the report.
df.info()
print(df.describe())

# Set plotting aesthetics for every figure created below.
# set_theme() is seaborn's preferred interface; sns.set() is only a legacy
# alias for it that the seaborn docs say may be removed in the future.
sns.set_theme(style="whitegrid")

# --- Univariate Analysis ---

# Numerical features: for each one, show a histogram with a KDE overlay
# followed by a boxplot, each in its own figure.
numerical_cols = ['age', 'average_watch_time', 'rating']
for feature in numerical_cols:
    plt.figure(figsize=(8, 4))
    hist_ax = sns.histplot(data=df, x=feature, kde=True, bins=30)
    hist_ax.set_title(f'Distribution of {feature}')
    plt.show()

    plt.figure(figsize=(8, 4))
    box_ax = sns.boxplot(data=df, x=feature)
    box_ax.set_title(f'Boxplot of {feature}')
    plt.show()

# Categorical features: one bar chart of category frequencies per column,
# with bars ordered from most to least frequent.
categorical_cols = ['gender', 'genre', 'location', 'user_preferences']
for feature in categorical_cols:
    by_frequency = df[feature].value_counts().index
    plt.figure(figsize=(10, 4))
    count_ax = sns.countplot(data=df, x=feature, order=by_frequency)
    # Tilt the category labels so longer names do not overlap.
    plt.setp(count_ax.get_xticklabels(), rotation=45)
    count_ax.set_title(f'Countplot of {feature}')
    plt.show()

# --- Bivariate/Multivariate Analysis ---

# Correlation matrix of the three numeric features, drawn as an annotated heatmap
corr_features = ['age', 'average_watch_time', 'rating']
corr = df[corr_features].corr()

plt.figure(figsize=(10, 6))
heatmap_ax = sns.heatmap(corr, annot=True, cmap='coolwarm')
heatmap_ax.set_title('Correlation Matrix')
plt.show()

# Pairplot: grid of pairwise scatterplots (and per-feature distributions)
pair_features = df.loc[:, ['age', 'average_watch_time', 'rating']]
sns.pairplot(pair_features)
# pairplot leaves its own figure current; the title is nudged slightly above
# the top edge (y=1.02) so it sits clear of the grid.
plt.gcf().suptitle('Pairplot of Numerical Features', y=1.02)
plt.show()

# Scatterplots: each entry is (x column, y column, figure title)
scatter_specs = (
    ('age', 'average_watch_time', 'Age vs Average Watch Time'),
    ('release_year', 'rating', 'Release Year vs Rating'),
)
for x_col, y_col, title in scatter_specs:
    plt.figure(figsize=(8, 5))
    scatter_ax = sns.scatterplot(data=df, x=x_col, y=y_col)
    scatter_ax.set_title(title)
    plt.show()

# Grouped boxplots of rating: each entry is (grouping column, figure title)
rating_groupings = (
    ('genre', 'Rating by Genre'),
    ('user_preferences', 'Rating by User Preferences'),
)
for group_col, title in rating_groupings:
    plt.figure(figsize=(12, 5))
    rating_ax = sns.boxplot(data=df, x=group_col, y='rating')
    # Tilt the group labels so they stay readable.
    plt.setp(rating_ax.get_xticklabels(), rotation=45)
    rating_ax.set_title(title)
    plt.show()

# Spread of average watch time within each genre
plt.figure(figsize=(12, 5))
watch_ax = sns.boxplot(data=df, x='genre', y='average_watch_time')
# Tilt the genre labels so they stay readable.
plt.setp(watch_ax.get_xticklabels(), rotation=45)
watch_ax.set_title('Average Watch Time by Genre')
plt.show()