In [4]:
!pip install PySpark
!pip install findspark

Collecting findspark
  Downloading findspark-2.0.1-py2.py3-none-any.whl (4.4 kB)
Installing collected packages: findspark
Successfully installed findspark-2.0.1


In [54]:
import pyspark
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql.functions import expr
from pyspark.sql.functions import col

# Start (or reuse) the Spark session used by the rest of this cell.
spark = (
    SparkSession.builder
    .appName("Loads Data with PySpark")
    .getOrCreate()
)

# Load the CSV: the first row is the header and column types are inferred.
data_path = "/content/Book_ipl22_ver_33.csv"
df = spark.read.options(header=True, inferSchema=True).csv(data_path)

# Exploratory transformations. Only `df_with_new_column` is displayed below;
# the other frames are defined here but not shown or written in this cell.
selected_df = df.select("team1", "team2", "won_by", "highscore")
filtered_df = df.filter(df["highscore"] > 60)

# Per-match aggregates. The aliases describe what is actually computed; the
# previous aliases ("first_ings_score", "match_winner") reused the names of
# unrelated source columns and mislabelled the results.
grouped_df = df.groupBy("match_id").agg(
    F.avg("highscore").alias("avg_highscore"),
    F.max("first_ings_wkts").alias("max_first_ings_wkts"),
)

# Derived column: twice the `highscore` value of each row.
df_with_new_column = df.withColumn("double_runs", expr("highscore * 2"))

# Sum of `highscore` over all rows sharing the same `team1`.
team_runs_df = df.groupBy("team1").agg(
    F.sum("highscore").alias("total_runs")
)
df_with_new_column.show()

# Handling Missing Values
df_cleaned = df.dropna()  # Drop rows with any missing values

# Removing Duplicates
df_cleaned = df_cleaned.dropDuplicates()

# Converting Data Types
# The CSV stores dates as text such as "March 26,2022". A plain
# cast("date") cannot parse that layout and turns the value into NULL,
# so parse it with an explicit pattern instead.
df_cleaned = df_cleaned.withColumn("date", F.to_date(col("date"), "MMMM d,yyyy"))

# Show the cleaned data
df_cleaned.show()

# Destination directory for the Parquet dataset.
output_path_parquet = "/content/output"

# Persist the cleaned frame, replacing whatever a previous run left there.
parquet_writer = df_cleaned.write.mode("overwrite")
parquet_writer.parquet(output_path_parquet)

# Shut the Spark session down now that the data is on disk.
spark.stop()


+--------+-------------+--------------------+---------+---------+-----+-----------+-------------+----------------+---------------+-----------------+----------------+------------+-------+------+-------------------+-------------------+---------+-----------------+-------------------+-----------+
|match_id|         date|               venue|    team1|    team2|stage|toss_winner|toss_decision|first_ings_score|first_ings_wkts|second_ings_score|second_ings_wkts|match_winner| won_by|margin|player_of_the_match|         top_scorer|highscore|     best_bowling|best_bowling_figure|double_runs|
+--------+-------------+--------------------+---------+---------+-----+-----------+-------------+----------------+---------------+-----------------+----------------+------------+-------+------+-------------------+-------------------+---------+-----------------+-------------------+-----------+
|       1|March 26,2022|Wankhede Stadium,...|  Chennai|  Kolkata|Group|    Kolkata|        Field|             131|    

In [55]:
!pip install dash


Collecting dash
  Downloading dash-2.11.1-py3-none-any.whl (10.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m10.4/10.4 MB[0m [31m41.2 MB/s[0m eta [36m0:00:00[0m
Collecting Werkzeug<2.3.0 (from dash)
  Downloading Werkzeug-2.2.3-py3-none-any.whl (233 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m233.6/233.6 kB[0m [31m16.1 MB/s[0m eta [36m0:00:00[0m
Collecting dash-html-components==2.0.0 (from dash)
  Downloading dash_html_components-2.0.0-py3-none-any.whl (4.1 kB)
Collecting dash-core-components==2.0.0 (from dash)
  Downloading dash_core_components-2.0.0-py3-none-any.whl (3.8 kB)
Collecting dash-table==5.0.0 (from dash)
  Downloading dash_table-5.0.0-py3-none-any.whl (3.9 kB)
Collecting retrying (from dash)
  Downloading retrying-1.3.4-py3-none-any.whl (11 kB)
Collecting ansi2html (from dash)
  Downloading ansi2html-1.8.0-py3-none-any.whl (16 kB)
Installing collected packages: dash-table, dash-html-components, dash-core-components, W

In [60]:
import pandas as pd
import dash
from dash import dcc
from dash import html
from dash.dependencies import Input, Output


# Read the Parquet dataset into a pandas DataFrame.
parquet_path = "/content/output"
df_parquet = pd.read_parquet(parquet_path)

# Distinct `team1` values; they feed both the dropdown options and its default.
team_names = df_parquet['team1'].unique()

# The Dash application object that the callbacks below register on.
app = dash.Dash(__name__)

# Page layout: a team selector above the graph the callback fills in.
team_dropdown = dcc.Dropdown(
    id='team-dropdown',
    options=[{'label': name, 'value': name} for name in team_names],
    value=team_names[0],
)
highscore_graph = dcc.Graph(id='highscore-graph')
app.layout = html.Div([team_dropdown, highscore_graph])

# Callback to update the highscore graph based on selected team
@app.callback(
    Output('highscore-graph', 'figure'),
    Input('team-dropdown', 'value')
)
def update_highscore_graph(selected_team):
    """Build the high-score line chart for the team chosen in the dropdown.

    Args:
        selected_team: Current value of ``team-dropdown``; ``None`` when the
            dropdown has been cleared.

    Returns:
        A figure dict with a single trace of ``highscore`` against ``date``
        for the rows whose ``team1`` equals ``selected_team``.
    """
    # A line trace joins points in row order, so sort chronologically first;
    # otherwise the line doubles back on itself whenever rows are out of order.
    team_matches = (
        df_parquet[df_parquet['team1'] == selected_team]
        .sort_values('date')
    )

    # With a cleared dropdown there is nothing to plot; say so instead of
    # titling the chart "Highscore for None".
    if selected_team is None:
        title = 'Select a team'
    else:
        title = f'Highscore for {selected_team}'

    figure = {
        'data': [
            {'x': team_matches['date'], 'y': team_matches['highscore'], 'type': 'line', 'name': 'Highscore'},
        ],
        'layout': {
            'title': title
        }
    }
    return figure

if __name__ == '__main__':
    # `run_server` is a deprecated alias of `run` and is no longer available
    # in Dash 3, so call `run` directly.
    app.run(debug=True)


<IPython.core.display.Javascript object>

In [58]:
import pandas as pd
import dash
from dash import dcc
from dash import html
from dash.dependencies import Input, Output

# Read the Parquet dataset into a pandas DataFrame.
parquet_path = "/content/output"
df_parquet = pd.read_parquet(parquet_path)

# Per-team summary: mean `highscore` and number of rows (one `match_id` each)
# for every distinct `team1`.
team_stats = (
    df_parquet.groupby('team1')
    .agg(**{
        'Average Highscore': ('highscore', 'mean'),
        'Total Matches': ('match_id', 'count'),
    })
    .reset_index()
    .rename(columns={'team1': 'Team'})
)

# Distinct `team1` values; they feed both the dropdown options and its default.
team_names = df_parquet['team1'].unique()

# The Dash application object that the callbacks below register on.
app = dash.Dash(__name__)

# Page layout: team selector, then the two graphs the callbacks fill in.
team_dropdown = dcc.Dropdown(
    id='team-dropdown',
    options=[{'label': name, 'value': name} for name in team_names],
    value=team_names[0],
)
app.layout = html.Div([
    team_dropdown,
    dcc.Graph(id='highscore-graph'),
    dcc.Graph(id='team-stats-graph'),
])

# Callback to update the highscore graph based on selected team
@app.callback(
    Output('highscore-graph', 'figure'),
    Input('team-dropdown', 'value')
)
def update_highscore_graph(selected_team):
    """Build the high-score line chart for the team chosen in the dropdown.

    Args:
        selected_team: Current value of ``team-dropdown``; ``None`` when the
            dropdown has been cleared.

    Returns:
        A figure dict with a single trace of ``highscore`` against ``date``
        for the rows whose ``team1`` equals ``selected_team``.
    """
    # A line trace joins points in row order, so sort chronologically first;
    # otherwise the line doubles back on itself whenever rows are out of order.
    team_matches = (
        df_parquet[df_parquet['team1'] == selected_team]
        .sort_values('date')
    )

    # With a cleared dropdown there is nothing to plot; say so instead of
    # titling the chart "Highscore for None".
    if selected_team is None:
        title = 'Select a team'
    else:
        title = f'Highscore for {selected_team}'

    # Visualization: line chart of the high-score trend for the selected team
    figure = {
        'data': [
            {'x': team_matches['date'], 'y': team_matches['highscore'], 'type': 'line', 'name': 'Highscore'},
        ],
        'layout': {
            'title': title
        }
    }
    return figure

# Callback to update the team stats graph
@app.callback(
    Output('team-stats-graph', 'figure'),
    Input('team-dropdown', 'value')
)
def update_team_stats_graph(selected_team):
    """Build the bar chart of average high score per team.

    Every team in ``team_stats`` gets a bar; the bar of the team currently
    chosen in the dropdown is drawn in a contrasting colour. Previously the
    selection was filtered into an unused variable and had no visible effect.

    Args:
        selected_team: Current value of ``team-dropdown``; ``None`` when the
            dropdown has been cleared (then no bar is highlighted).

    Returns:
        A figure dict with one bar trace of ``Average Highscore`` by ``Team``.
    """
    # One colour per bar, in the same order as the x values.
    bar_colors = [
        '#d62728' if team == selected_team else '#1f77b4'
        for team in team_stats['Team']
    ]

    # Visualization: bar chart showing the average highscore for each team
    figure = {
        'data': [
            {
                'x': team_stats['Team'],
                'y': team_stats['Average Highscore'],
                'type': 'bar',
                'name': 'Average Highscore',
                'marker': {'color': bar_colors},
            },
        ],
        'layout': {
            'title': 'Average Highscore for Each Team',
            'xaxis': {'title': 'Team'},
            'yaxis': {'title': 'Average Highscore'}
        }
    }
    return figure

if __name__ == '__main__':
    # `run_server` is a deprecated alias of `run` and is no longer available
    # in Dash 3, so call `run` directly.
    app.run(debug=True)


<IPython.core.display.Javascript object>

In [61]:
import pandas as pd
import dash
from dash import dcc
from dash import html
from dash.dependencies import Input, Output

# Read the Parquet dataset into a pandas DataFrame.
parquet_path = "/content/output"
df_parquet = pd.read_parquet(parquet_path)

# Distinct match ids; they feed both the dropdown options and its default.
match_ids = df_parquet['match_id'].unique()

# The Dash application object that the callback below registers on.
app = dash.Dash(__name__)

# Page layout: heading, match selector, and the container the callback fills.
match_dropdown = dcc.Dropdown(
    id='match-dropdown',
    options=[{'label': str(mid), 'value': mid} for mid in match_ids],
    value=match_ids[0],
)
app.layout = html.Div([
    html.H1("IPL Match Information"),
    match_dropdown,
    html.Div(id='match-info'),
])

# Callback to display match information
@app.callback(
    Output('match-info', 'children'),
    Input('match-dropdown', 'value')
)
def display_match_info(selected_match_id):
    """Render a summary table for the match chosen in the dropdown.

    Args:
        selected_match_id: Current value of ``match-dropdown``; ``None`` when
            the dropdown has been cleared.

    Returns:
        An ``html.Table`` describing the first row whose ``match_id`` equals
        ``selected_match_id``, or a short ``html.P`` message when no row
        matches.
    """
    match_rows = df_parquet[df_parquet['match_id'] == selected_match_id]

    # Clearing the dropdown sends None, which matches no row; `.iloc[0]` on
    # the empty frame would raise IndexError inside the callback.
    if match_rows.empty:
        return html.P("No match selected.")

    match_info = match_rows.iloc[0]

    # (row label, source column) pairs shown below the match id.
    fields = [
        ("Team 1", 'team1'),
        ("Team 2", 'team2'),
        ("Date", 'date'),
        ("Venue", 'venue'),
        ("Highscore", 'highscore'),
        # Add more information fields as needed
    ]

    table_rows = [html.Tr([html.Th("Match ID"), html.Td(selected_match_id)])]
    table_rows.extend(
        html.Tr([html.Th(label), html.Td(match_info[column])])
        for label, column in fields
    )

    # Dash style dictionaries take camelCased CSS property names, so the
    # property is spelled `borderCollapse` rather than 'border-collapse'.
    table = html.Table(table_rows, style={'borderCollapse': 'collapse'})

    return table

if __name__ == '__main__':
    # `run_server` is a deprecated alias of `run` and is no longer available
    # in Dash 3, so call `run` directly.
    app.run(debug=True)


<IPython.core.display.Javascript object>

In [62]:
import pandas as pd
import dash
from dash import dcc
from dash import html
from dash.dependencies import Input, Output
import plotly.express as px

# Read the Parquet dataset into a pandas DataFrame.
parquet_path = "/content/output"
df_parquet = pd.read_parquet(parquet_path)

# Distinct match ids; they feed both the dropdown options and its default.
match_ids = df_parquet['match_id'].unique()

# The Dash application object that the callback below registers on.
app = dash.Dash(__name__)

# Page layout: heading, match selector, info container and bar graph.
match_dropdown = dcc.Dropdown(
    id='match-dropdown',
    options=[{'label': str(mid), 'value': mid} for mid in match_ids],
    value=match_ids[0],
)
app.layout = html.Div([
    html.H1("IPL Match Information"),
    match_dropdown,
    html.Div(id='match-info'),
    dcc.Graph(id='bar-graph'),
])

# Callback to display match information and bar graph
@app.callback(
    [Output('match-info', 'children'),
     Output('bar-graph', 'figure')],
    Input('match-dropdown', 'value')
)
def display_match_info(selected_match_id):
    """Render the summary table and bar graph for the selected match.

    Args:
        selected_match_id: Current value of ``match-dropdown``; ``None`` when
            the dropdown has been cleared.

    Returns:
        A ``(children, figure)`` pair for ``match-info`` and ``bar-graph``.
        When no row has the given ``match_id`` the pair is a short message
        and an empty figure.
    """
    # Filter once and reuse the result for both the table and the chart.
    match_rows = df_parquet[df_parquet['match_id'] == selected_match_id]

    # Clearing the dropdown sends None, which matches no row; `.iloc[0]` on
    # the empty frame would raise IndexError inside the callback.
    if match_rows.empty:
        return html.P("No match selected."), {'data': [], 'layout': {}}

    match_info = match_rows.iloc[0]

    # (row label, source column) pairs shown below the match id.
    fields = [
        ("Team 1", 'team1'),
        ("Team 2", 'team2'),
        ("Date", 'date'),
        ("Venue", 'venue'),
        ("Highscore", 'highscore'),
        # Add more information fields as needed
    ]

    table_rows = [html.Tr([html.Th("Match ID"), html.Td(selected_match_id)])]
    table_rows.extend(
        html.Tr([html.Th(label), html.Td(match_info[column])])
        for label, column in fields
    )

    # Dash style dictionaries take camelCased CSS property names, so the
    # property is spelled `borderCollapse` rather than 'border-collapse'.
    table = html.Table(table_rows, style={'borderCollapse': 'collapse'})

    # Bar graph of `highscore` by `team1` for the selected match's rows.
    # NOTE(review): each row carries a single `highscore`, so this draws one
    # bar per row rather than one per team named in the title — confirm intent.
    bar_fig = px.bar(match_rows, x='team1', y='highscore',
                     title=f"High Scores of {match_info['team1']} and {match_info['team2']}")

    return table, bar_fig

if __name__ == '__main__':
    # `run_server` is a deprecated alias of `run` and is no longer available
    # in Dash 3, so call `run` directly.
    app.run(debug=True)


<IPython.core.display.Javascript object>