In [1]:
import pyspark
from pyspark import SparkConf, SparkContext
from pyspark.sql import *
from pyspark.sql.functions import *
from pyspark.sql import functions as F
from pyspark.sql.types import StructType, StructField, IntegerType, StringType, FloatType, BooleanType

In [2]:
import string
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np

In [3]:
from pyspark.sql import SparkSession

# Start (or reuse) the notebook's Spark session.
spark = (
    SparkSession.builder
    # Local mode with 9 worker threads.
    .master('local[9]')
    # Heap available to the driver JVM.
    .config("spark.driver.memory", "15g")
    # Render DataFrames eagerly when they are the last expression of a cell.
    .config("spark.sql.repl.eagerEval.enabled", True)
    .getOrCreate()
)

In [4]:
# Explicit schema for the header-less cleaned bigram CSV files.
# Columns are listed in file order; every column is nullable.
schema = StructType([
    StructField(column_name, column_type, True)
    for column_name, column_type in (
        ("bigram", StringType()),
        ("year", IntegerType()),
        ("count", IntegerType()),
        ("bigram_percent", FloatType()),
        ("bigram_percent_smooth", FloatType()),
    )
])

In [5]:
# First cleaned lowercase input; the files carry no header row, so the
# explicit schema supplies column names and types.
df1 = spark.read.csv(
    '/data/shared1/cleandata/lowercase1_clean',
    schema=schema,
    header=False,
)

In [6]:
# Second cleaned lowercase input, read with the same header-less schema.
df2 = spark.read.csv(
    '/data/shared1/cleandata/lowercase2_clean',
    schema=schema,
    header=False,
)

In [7]:
# Cleaned uppercase input, read with the same header-less schema.
df3 = spark.read.csv(
    '/data/shared1/cleandata/uppercase_clean',
    schema=schema,
    header=False,
)

In [8]:
# Stack the three inputs into one DataFrame. union() matches columns by
# position and keeps duplicate rows; all three frames share `schema`.
uniondf = df1.union(df2).union(df3)

In [10]:
# Sanity check: how many unique bigram strings the combined inputs contain.
(
    uniondf
    .select(F.countDistinct(F.col('bigram')))
    .show()
)

+----------------------+
|count(DISTINCT bigram)|
+----------------------+
|                205801|
+----------------------+



In [9]:
# Collapse rows that share the same (bigram, year) across the three inputs by
# summing their counts and both percentage columns, then order by year.
# NOTE(review): the write cell later repartitions by 'bigram', which presumably
# discards this global ordering — confirm whether the sort is still needed.
cleandf = (
    uniondf
    .groupBy('bigram', 'year')
    .agg(
        F.sum('count').alias('count'),
        F.sum('bigram_percent').alias('bigram_percent'),
        F.sum('bigram_percent_smooth').alias('bigram_percent_smooth'),
    )
    .sort('year')  # ascending is the default sort direction
)

In [10]:
# Persist the aggregated table as CSV.
# repartition('bigram') hash-partitions the rows so every row of a given bigram
# lands in the same partition (and therefore the same output part-file);
# sortWithinPartitions then orders each partition by (bigram, year).
# mode('overwrite') keeps the cell re-runnable: with the default save mode the
# write raises as soon as the target directory already exists, which breaks
# Restart & Run All after the first successful run.
(
    cleandf
    .repartition('bigram')
    .sortWithinPartitions('bigram', 'year')
    .write
    .mode('overwrite')
    .csv('/data/shared1/cleandata/clean_allfiles')
)