In [1]:
import pandas as pd
import sqlite3

# Part 1: Reading Files
# Each source file is loaded into its own DataFrame; the later cells rely on
# the names customers_df, iris_df, titanic_df, flights_df and movie_df.

# 1. Reading chinook.db (SQLite)
conn = sqlite3.connect('chinook.db')
try:
    customers_df = pd.read_sql_query("SELECT * FROM customers", conn)
finally:
    # Release the database handle even when the query raises
    # (e.g. the table is missing), so the file is never left open.
    conn.close()
print(customers_df.head(10))

# 2. Reading iris.json (JSON)
iris_df = pd.read_json('iris.json')
print(iris_df.shape)
print(iris_df.columns)

# 3. Reading titanic.xlsx (Excel)
titanic_df = pd.read_excel('titanic.xlsx')
print(titanic_df.head())

# 4. Reading Flights parquet file (Parquet)
flights_df = pd.read_parquet('flights.parquet')
# DataFrame.info() writes its report straight to stdout and returns None,
# so it must not be wrapped in print() (that would emit a stray "None").
flights_df.info()

# 5. Reading movie.csv (CSV)
movie_df = pd.read_csv('movie.csv')
# sample() draws rows at random, so this preview differs between runs.
print(movie_df.sample(10))


ModuleNotFoundError: No module named 'pandas'

In [None]:
# Part 2: Exploring DataFrames
# Works on the DataFrames loaded in Part 1.

# 1. iris.json: lower-case every column label, then keep the two sepal columns
iris_df.columns = [label.lower() for label in iris_df.columns]
iris_selected = iris_df.loc[:, ['sepal_length', 'sepal_width']]
print(iris_selected.head())

# 2. titanic.xlsx: passengers older than 30, and the head-count per sex
titanic_filtered = titanic_df.loc[titanic_df['Age'] > 30]
print(titanic_filtered)
gender_counts = titanic_df.loc[:, 'Sex'].value_counts()
print(gender_counts)

# 3. Flights parquet: route/carrier columns and the number of distinct destinations
flights_selected = flights_df.loc[:, ['origin', 'dest', 'carrier']]
print(flights_selected.head())
unique_destinations = flights_df.loc[:, 'dest'].nunique()
print(f"Number of unique destinations: {unique_destinations}")

# 4. movie.csv: films running over 120, ordered by director likes (highest first)
long_movies = movie_df.loc[movie_df['duration'] > 120]
sorted_long_movies = long_movies.sort_values(
    'director_facebook_likes',
    ascending=False,
)
print(sorted_long_movies.head())

In [None]:
# Part 3: Challenges and Explorations
# Works on the DataFrames loaded in Part 1.

# iris.json: Statistical Analysis
# describe() labels the median row '50%'.
print(iris_df.describe().loc[['mean', '50%', 'std']])

# titanic.xlsx: Age Analysis
print(f"Min Age: {titanic_df['Age'].min()}")
print(f"Max Age: {titanic_df['Age'].max()}")
print(f"Total Age Sum: {titanic_df['Age'].sum()}")

# movie.csv: Director Analysis
# Sum the likes per director; idxmax() returns the director name (the group
# label) holding the largest total, not the total itself.
director_likes = movie_df.groupby('director_name')['director_facebook_likes'].sum().idxmax()
print(f"Director with highest total likes: {director_likes}")

longest_movies = movie_df[['title', 'director_name', 'duration']].sort_values(by='duration', ascending=False).head(5)
print(longest_movies)

# Flights parquet: Handling Missing Values
missing_counts = flights_df.isna().sum()
print(f"Missing values:\n{missing_counts}")

# Filling missing values in numerical columns with their means.
# The filled columns are assigned back explicitly: calling
# flights_df[column].fillna(..., inplace=True) operates on a temporary Series
# and, under pandas Copy-on-Write, leaves flights_df unchanged.
numeric_columns = flights_df.select_dtypes(include=['float64', 'int64']).columns
# fillna() with a Series of per-column means fills each column with its own mean.
flights_df[numeric_columns] = flights_df[numeric_columns].fillna(
    flights_df[numeric_columns].mean()
)