In [1]:
import pyspark
from pyspark import SparkConf, SparkContext
from pyspark.sql import *
from pyspark.sql.functions import *
from pyspark.sql import functions as F
from pyspark.sql.types import StructType, StructField, IntegerType, StringType, DoubleType

import string
import numpy as np

In [2]:
from pyspark.sql import SparkSession

# Create (or reuse) the Spark session used by every cell below.
spark = (
    SparkSession.builder
    .master('local[18]')
    .config("spark.driver.memory", "15g")
    .config("spark.sql.repl.eagerEval.enabled", True)
    .getOrCreate()
)

### Load data

In [3]:
# Explicit schema for the annotated bigram data. Each pair is
# (column name, Spark type); every column is declared nullable.
schema = StructType([
    StructField(column_name, column_type(), True)
    for column_name, column_type in (
        ("bigram", StringType),
        ("year", IntegerType),
        ("count", IntegerType),
        ("bigram_percent", DoubleType),
        ("medianA", DoubleType),
        ("ntileA", IntegerType),
        ("medianB", DoubleType),
        ("ntileB", IntegerType),
        ("label", IntegerType),
        ("start_of_mainstreaming", IntegerType),
        ("end_of_mainstreaming", IntegerType),
        ("peak_year", IntegerType),
    )
])

In [13]:
# Read the header-less CSV directory using the explicit schema.
df = (
    spark.read
    .schema(schema)
    .option('header', False)
    .csv('/data/shared1/cleandata/changepoint_annotated_allntiles_newer')
)

### Divide into train/test and classification dataframes

In [5]:
# Assign one random year to every distinct bigram.
#
# Grouping by 'bigram' collapses the frame to one row per bigram, so the year
# drawn below is shared by all of that bigram's rows once this frame is joined
# back onto `df`.
#
# The aggregate column keeps Spark's default name 'sum(count)'. The previous
# `.alias('count_sum')` was applied to the DataFrame, not to the column, so it
# never renamed anything; the later clean-up cell drops the column by its
# default name, so that name is kept here.
#
# F.rand is seeded (same seed value as the train/classify split) so the lazily
# re-evaluated plan does not draw fresh years on every action.
# F.round is spelled out so the cell does not depend on the star import
# shadowing Python's built-in round().
#
# NOTE(review): round(U * 90) makes the endpoints 1930 and 2020 half as likely
# as the years in between, and random_year is a double -- confirm both are
# intended before changing them.
randomyear_df = (
    df.groupBy('bigram')
      .agg(F.sum('count'))
      .withColumn('random_year', F.round(F.rand(seed=4) * 90) + 1930)
)

In [6]:
# Attach each bigram's random_year to every row of the original dataframe.
df = df.join(randomyear_df, on='bigram', how='inner')

In [7]:
# Discard the raw per-year count and the per-bigram summed count.
df = df.drop('count').drop('sum(count)')

In [8]:
# Print the column names, types and nullability of `df` as a tree.
df.printSchema()

root
 |-- bigram: string (nullable = true)
 |-- year: integer (nullable = true)
 |-- bigram_percent: double (nullable = true)
 |-- medianA: double (nullable = true)
 |-- ntileA: integer (nullable = true)
 |-- medianB: double (nullable = true)
 |-- ntileB: integer (nullable = true)
 |-- label: integer (nullable = true)
 |-- start_of_mainstreaming: integer (nullable = true)
 |-- end_of_mainstreaming: integer (nullable = true)
 |-- peak_year: integer (nullable = true)
 |-- random_year: double (nullable = true)



In [9]:
# Full dataset: split the distinct bigrams 80/20 with a fixed seed, then
# select the rows of `df` that belong to each group of bigrams.
traintest_ngrams, classify_ngrams = (
    df.select('bigram')
      .distinct()
      .randomSplit(weights=[0.8, 0.2], seed=4)
)
classify_df = df.join(classify_ngrams, on='bigram', how='inner')
traintest_df = df.join(traintest_ngrams, on='bigram', how='inner')

In [9]:
# Small dataset: keep a 5% sample (without replacement) of the distinct
# bigrams, split that sample 80/20, then select the matching rows of `df`.
sample = (
    df.select('bigram')
      .distinct()
      .sample(False, fraction=0.05, seed=4)
)
traintest_ngrams, classify_ngrams = sample.randomSplit(weights=[0.8, 0.2], seed=4)
classify_df = df.join(classify_ngrams, on='bigram', how='inner')
traintest_df = df.join(traintest_ngrams, on='bigram', how='inner')

### Save dataframes

In [10]:
# Write the train/test rows as CSV. 'errorifexists' is the writer's default
# save mode, spelled out here: the write fails if the path already exists.
traintest_df.write.mode('errorifexists').csv('/data/shared1/cleandata/traintest_sample_5percent')

In [11]:
# Write the classification rows as CSV. 'errorifexists' is the writer's default
# save mode, spelled out here: the write fails if the path already exists.
classify_df.write.mode('errorifexists').csv('/data/shared1/cleandata/classify_set_newer')