<a href="https://colab.research.google.com/github/mehr64/OnlineLibrarySystem/blob/master/MyProject_MapReduce.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Install a headless OpenJDK 8 package quietly (-qq), discarding stdout.
!apt-get install openjdk-8-jdk-headless -qq > /dev/null
# Download the Spark 3.1.1 (Hadoop 3.2 build) tarball from the Apache archive.
!wget -q http://archive.apache.org/dist/spark/spark-3.1.1/spark-3.1.1-bin-hadoop3.2.tgz
# Unpack the tarball into the current working directory.
!tar xf spark-3.1.1-bin-hadoop3.2.tgz
# Install findspark, which a later cell imports and initialises.
# NOTE(review): the version is not pinned, so a re-run may pick up a different release.
!pip install -q findspark

In [None]:
import os

# Export the Java and Spark install locations in one step. The Spark path
# matches the directory name of the spark-3.1.1-bin-hadoop3.2 tarball
# unpacked by the install cell.
os.environ.update(
    {
        "JAVA_HOME": "/usr/lib/jvm/java-8-openjdk-amd64",
        "SPARK_HOME": "/content/spark-3.1.1-bin-hadoop3.2",
    }
)

In [None]:
import findspark
# Must run before the pyspark import below.
# NOTE(review): presumably this locates Spark through the SPARK_HOME variable
# set in the previous cell and makes pyspark importable — confirm.
findspark.init()
from pyspark.sql import SparkSession
# Start (or reuse) a session with master "local[*]".
spark = SparkSession.builder.master("local[*]").getOrCreate()
spark.conf.set("spark.sql.repl.eagerEval.enabled", True) # Property used to format output tables better

In [None]:
#Taking the Actual Number of Recorded Crimes and Analysing the Quarterly Distribution of Crimes in
#Various Police Force Areas Based on Year and Quarter
from pyspark import SparkConf, SparkContext

def parseCrime(line):
    """Convert one CSV record into a ``((area, year, season), count)`` pair.

    The record is parsed with the ``csv`` module instead of ``str.split(',')``
    so that a quoted field containing a comma (for example a force name such
    as ``"London, City of"``) is kept as a single, unquoted value rather than
    being cut at the embedded comma.

    Args:
        line: One raw text line of the CSV file.

    Returns:
        A tuple ``((area, year, season), number_of_crimes)`` where ``year`` is
        field 0, ``season`` is field 1, ``area`` is field 2 and
        ``number_of_crimes`` is the last field converted to ``int``. If the
        last field is not an integer (as in the header row), the count is 0.

    Raises:
        IndexError: If the record has fewer than three fields.
    """
    # Imported locally so the function carries its own dependency.
    import csv

    # csv.reader expects an iterable of lines; an empty line yields no fields.
    fields = next(csv.reader([line]), [])

    area = fields[2]
    year = fields[0]
    season = fields[1]
    try:
        number_of_crimes = int(fields[-1])  # Number of crimes in the last column
    except ValueError:
        number_of_crimes = 0  # Non-numeric count (e.g. header text) counts as 0

    return ((area, year, season), number_of_crimes)

if __name__ == "__main__":
    # Sum the recorded offences per (area, year, quarter) and print them sorted.
    conf = SparkConf().setAppName("CrimeQuarterlyAnalysis")
    sc = SparkContext.getOrCreate(conf=conf)

    # `lines` stays the raw, unfiltered RDD of text lines.
    lines = sc.textFile("/content/sample_data/cleaned_data.csv")

    # The file starts with a header row. Without this filter it would be
    # aggregated as a bogus key with a count of 0 and printed with the results.
    # take(1) is used instead of first() so an empty file does not raise.
    headerRows = lines.take(1)
    header = headerRows[0] if headerRows else None
    dataLines = lines.filter(lambda line: line != header)

    crimeData = dataLines.map(parseCrime)
    crimeCounts = crimeData.reduceByKey(lambda x, y: x + y)

    # The key is already the (area, year, season) tuple, so sorting by it
    # orders by area, then year, then season.
    sortedCrimeCounts = crimeCounts.sortBy(lambda x: x[0])

    results = sortedCrimeCounts.collect()
    for result in results:
        print(f"Area: {result[0][0]}, Year: {result[0][1]}, Season: {result[0][2]}, Count: {result[1]}")

Area: "London, Year: 2019/20, Season: 1, Count: 2331
Area: "London, Year: 2019/20, Season: 2, Count: 2144
Area: "London, Year: 2019/20, Season: 3, Count: 2339
Area: "London, Year: 2019/20, Season: 4, Count: 1881
Area: "London, Year: 2020/21, Season: 1, Count: 632
Area: "London, Year: 2020/21, Season: 2, Count: 1323
Area: "London, Year: 2020/21, Season: 3, Count: 1205
Area: "London, Year: 2020/21, Season: 4, Count: 662
Area: "London, Year: 2021/22, Season: 1, Count: 1113
Area: "London, Year: 2021/22, Season: 2, Count: 1722
Area: "London, Year: 2021/22, Season: 3, Count: 1869
Area: "London, Year: 2021/22, Season: 4, Count: 1608
Area: "London, Year: 2022/23, Season: 1, Count: 1815
Area: "London, Year: 2022/23, Season: 2, Count: 1946
Area: "London, Year: 2022/23, Season: 3, Count: 1901
Area: "London, Year: 2022/23, Season: 4, Count: 1930
Area: "London, Year: 2023/24, Season: 1, Count: 1521
Area: Avon and Somerset, Year: 2019/20, Season: 1, Count: 35905
Area: Avon and Somerset, Year: 2019/2

In [None]:
#Calculate the total number of crimes reported per year.
def mapYearlyCrimes(line):
    """Map one CSV line to a ``(year, count)`` pair.

    Args:
        line: One raw text line of the CSV file.

    Returns:
        ``(year, count)`` where ``year`` is the text before the first comma
        and ``count`` is the text after the last comma converted to ``int``.
        A count that cannot be converted is reported as 0.
    """
    # Only the first and last comma-separated fields are needed.
    year, _, _ = line.partition(',')
    _, _, last_field = line.rpartition(',')

    try:
        return (year, int(last_field))
    except ValueError:
        # Fall back to 0 when the last column is not an integer.
        return (year, 0)

# TODO: the Spark setup for this cell is still missing. NOTE(review): `lines` is
# not defined here, so it must already exist from a previously executed cell.

crimeData = lines.map(mapYearlyCrimes)
yearlyCrimeCounts = crimeData.reduceByKey(lambda x, y: x + y)

# TODO: collect and print yearlyCrimeCounts; this cell does not display anything yet.


In [None]:
from pyspark import SparkConf, SparkContext

def parseCrime(line):
    """Map one CSV line to ``(financial_year, 1)`` so that rows can be counted per year.

    Args:
        line: One raw text line of the CSV file.

    Returns:
        A tuple of the text before the first comma and the constant 1.
    """
    financial_year = line.partition(',')[0]
    return (financial_year, 1)

if __name__ == "__main__":
    # Count how many data rows the CSV contains for each financial year.
    conf = SparkConf().setAppName("CrimeAnalysis")
    sc = SparkContext.getOrCreate(conf=conf)

    # `lines` stays the raw, unfiltered RDD of text lines.
    lines = sc.textFile("/content/sample_data/cleaned_data.csv")

    # Skip the header row; otherwise it is counted as a year of its own
    # ("Year: Financial Year, Number of Crimes: 1").
    # take(1) is used instead of first() so an empty file does not raise.
    headerRows = lines.take(1)
    header = headerRows[0] if headerRows else None
    dataLines = lines.filter(lambda line: line != header)

    # Convert to (Financial Year, 1)
    crimeData = dataLines.map(parseCrime)

    # Sum the 1s per year. This is the number of rows for that year,
    # not the sum of the offence counts stored in the rows.
    crimeCounts = crimeData.reduceByKey(lambda x, y: x + y)

    # Results
    results = crimeCounts.collect()
    for result in results:
        print(f"Year: {result[0]}, Number of Crimes: {result[1]}")

Year: Financial Year, Number of Crimes: 1
Year: 2019/20, Number of Crimes: 22882
Year: 2021/22, Number of Crimes: 23220
Year: 2020/21, Number of Crimes: 23220
Year: 2022/23, Number of Crimes: 23220
Year: 2023/24, Number of Crimes: 6235


In [None]:
from pyspark import SparkConf, SparkContext

def parseCrime(line):
    """Map one CSV line to a ``(year, offences)`` pair.

    Args:
        line: One raw text line of the CSV file.

    Returns:
        ``(year, offences)`` where ``year`` is the text before the first comma
        and ``offences`` is the text after the last comma converted to ``int``.

    Raises:
        ValueError: If the last field is not an integer.
    """
    first_field = line.partition(',')[0]
    last_field = line.rpartition(',')[2]
    return (first_field, int(last_field))

if __name__ == "__main__":
    # Sum the offence counts per financial year and print one line per year.
    conf = SparkConf().setAppName("CrimeAnalysis")
    # Reuses the running SparkContext if there is one, otherwise creates it.
    sc = SparkContext.getOrCreate(conf=conf)

    lines = sc.textFile("/content/sample_data/cleaned_data.csv")

    # Filter out the header row
    # (parseCrime calls int() on the last field without a fallback, so the
    # header text would raise ValueError if it were left in).
    header = lines.first()
    filteredLines = lines.filter(lambda line: line != header)

    # Map and Reduce
    crimeData = filteredLines.map(parseCrime)
    crimeCounts = crimeData.reduceByKey(lambda x, y: x + y)

    # Collect and print results
    # (no sort is applied, so the order of the printed years is not chronological).
    results = crimeCounts.collect()
    for result in results:
        print(f"Year: {result[0]}, Total Offences: {result[1]}")


Year: 2019/20, Total Offences: 5233676
Year: 2021/22, Total Offences: 5281533
Year: 2020/21, Total Offences: 4570453
Year: 2022/23, Total Offences: 5516196
Year: 2023/24, Total Offences: 1392345


In [None]:
from pyspark.sql import SparkSession

# NOTE(review): an earlier cell already created a session, so getOrCreate()
# presumably returns that one — confirm whether the 'statistic' app name takes effect.
spark = SparkSession.builder.appName('statistic').getOrCreate()

# Read the CSV with the first line as column names and column types inferred from the data.
df = spark.read.csv('/content/sample_data/cleaned_data.csv', header=True, inferSchema=True)

# Describe the DataFrame to get summary statistics
df.describe().show()


+-------+--------------+------------------+-----------------+--------------------+--------------------+--------------------+------------------+------------------+
|summary|Financial Year| Financial Quarter|       Force Name| Offence Description|       Offence Group|    Offence Subgroup|      Offence Code|Number of Offences|
+-------+--------------+------------------+-----------------+--------------------+--------------------+--------------------+------------------+------------------+
|  count|         98777|             98777|            98777|               98777|               98777|               98777|             98777|             98777|
|   mean|          null| 2.405347398685929|             null|                null|                null|                null| 65.97837837837909|222.66522571043868|
| stddev|          null|1.1419964195675791|             null|                null|                null|                null|127.47204537354665| 861.6207697187683|
|    min|       2019/2