# Project

In [1]:
import pandas as pd
import json
import numpy as np


In [2]:
df = pd.read_csv("keystrokes-recipes.csv")
KEYWORDS = ["Shift", "Backspace", "Enter", "ArrowDown",
            "ArrowLeft", "ArrowRight", "ArrowUp", "End", "Control", "CapsLock"]

#create a copy of the dataset to another csv file

df.to_csv('keystrokes-recipes-modified.csv', index=False)

- ```keystrokes-recipes.csv``` is the original data and we keep it in case we want to look back at one moment
- ```keystrokes-recipes-modified.csv``` is the modified data


## Data cleaning and sorting


Our data consists of a csv file with event dates, user ids, keystrokes and the recipes they wrote.
We clean all the data by working throught the keystrokes first.

* We group the characters into the word written and separate between important keywords typed such as backspace, shift, enter etc. The sequence ["shift", "p", "e", "r"] becomes ["shift", "per"] 
* We sort by user id to get a better idea of every recipe every student has written.

## Processing the data


The first thing we did was isolate the keystrokes to a new ```json``` file saved in ```data/all_keystrokes.json```

The next step is to group words together and separate them from keywords and we work between each whitespace.
 
So for example this entry: 

```{'time': 1662252404346, 'character': 'Shift'}, {'time': 1662252404376, 'character': 'f'}, {'time': 1662252404505, 'character': 'i'}``` 

gives the following output: 

```{'time': 1662252404346, 'word': 'Shift'}, {'time': 1662252404505, 'word': 'fi'}```


In [3]:
def find_seq(chars):
    return "".join(list(filter(lambda _ : _ not in KEYWORDS, chars)))

def separate_entry(json_values):
    new_data = []
    last_whitespace = 0
    characters = [arr[1] for arr in json_values]

    for i, (time, character) in enumerate(json_values):
        if character.isspace():
            word = characters[last_whitespace: i]
            if not any(i in word for i in KEYWORDS):
                new_data.append({'time': time, 'word': "".join(word)})
            else:
                new_data.append({'time': time, 'word': find_seq(word)})
            last_whitespace = i+1

        elif character in KEYWORDS:
            new_data.append({'time': time, 'word': character})
        
        elif not character:
            continue

    return new_data

def compute():
    new_df = pd.DataFrame(pd.read_json("data/all_keystrokes.json"))
    arr = []
    for jsonf in new_df.values:
        sub_arr = []
        for d in jsonf:
            if d is not None:
                sub_arr.append([d["time"], d["character"]])
        arr.append(sub_arr)

    result = []
    for jsonf in arr:
        result.append(separate_entry(jsonf))

    import json
    with open("data/new_data.json", "w") as f:
        json.dump(result, f)
    return

compute()



Basically, ``` compute()``` will format the data for the ```separate_entry``` function and when everything is computed, it dumps all the data in a new json file.

```separate_entry``` computes the words between each space character, all the while separating words from keywords. It uses the function ```find_seq``` to separate the characters from keywords so it allows to isolate words between each whitespaces.


## Modifying the CSV file

We just modify the keystroke data for each row of the original data in ```keystrokes-recipes.csv``` but apply it to ```keystrokes-recipes-modified.csv```

In [4]:
jsons = pd.read_json("data/new_data.json").values.tolist()

for i, json in enumerate(jsons):
    jsons[i]= list(filter(lambda _ : _ is not None, json))

def write_to_csv_file(filename, recipes_len):
    df = pd.read_csv(filename)
    for i in range(recipes_len):
        df.loc[i,"ks"] = jsons[i]
    df.to_csv(filename, index=False)


#Uncomment only if you are running the notebook for the first time
write_to_csv_file("keystrokes-recipes-modified.csv", len(jsons))


## Sorting by user id

We want to sort by user id in order to differentiate behaviour between different people more easily

In [5]:
csv_filename = "keystrokes-recipes-modified.csv"

pd.read_csv(csv_filename).sort_values(by="user_id", ascending=True).to_csv(csv_filename, index=False)

In [30]:
from pyspark.ml.fpm import PrefixSpan
from pyspark.shell import sc
from pyspark.sql.functions import desc
from pyspark.sql.types import Row

from csv import reader

recipes = pd.read_csv('keystrokes-recipes.csv')['recipe'][:2]

copy = recipes.copy()
copy[0] = Row(sequence=[recipes[0].replace("\n", "").split(" ")])
copy[1] = Row(sequence=[recipes[1].replace("\n", "").split(" ")])


l = [copy[0], copy[1]]
print(l)

df = sc.parallelize(l).toDF()

prefixSpan = PrefixSpan(minSupport=0.1, maxPatternLength=3, maxLocalProjDBSize=32000000)

# Find frequent sequential patterns.
prefixSpan.findFrequentSequentialPatterns(df).sort(desc("freq")).show(10,False)


[Row(sequence=[['Firstly,', 'cut', 'up', 'some', 'chicken', 'breasts', 'into', 'cubes', 'before', 'then', 'dicing', 'one', 'white', 'onion.Next,', 'pour', 'a', 'drizzle', 'of', 'vegetable', 'oil', 'into', 'the', 'pan', 'before', 'placing', 'the', 'chicken', 'into', 'the', 'pan', 'to', 'brown.', 'Season', 'the', 'chicken,', 'whilst', 'continuously', 'stirring', 'until', 'golden.Mix', 'in', 'the', 'white', 'onion,', 'before', 'then', 'adding', 'a', 'tablespoon', 'of', 'garam', 'masala,', 'cumin', 'and', 'cyanne', 'pepper', 'for', 'spice.', 'Continue', 'to', 'stir', 'this', 'for', 'around', '1', 'minute', 'until', 'fragrant.Stir', 'in', 'some', '300ml', 'of', 'coconut', 'milk,', 'as', 'well', 'as', '300ml', 'of', 'tinned', 'chopped', 'tomatos', 'and', 'season', 'to', 'taste.Simmer', 'this', 'for', 'around', '15', 'minutes', 'until', 'the', 'chicken', 'is', 'cooked', 'through,', 'and', 'it', 'has', 'thickened', 'into', 'an', 'flavourful', 'curry.']]), Row(sequence=[['First', 'dice', 'two',

In [28]:
from pyspark.ml.fpm import PrefixSpan
from pyspark.shell import sc
from pyspark.sql.functions import desc
from pyspark.sql.types import Row

from csv import reader

# open file in read mode
list = []

with open("store_data2.csv", 'r') as read_obj:
    # pass the file object to reader() to get the reader object
    csv_reader = reader(read_obj)
    customerlist=[]
    sequence = []
    anotherList=[]
    for row in csv_reader:# row variable is a list that represents a row in csv  # print(row)

        list2 = []

        for x in row:
              if x.isdigit():
                
                if  len(customerlist) == 0 or  customerlist[len(customerlist) - 1] != x:

                     if len(customerlist) != 0:
                            list.append(Row(sequence=anotherList))

                     customerlist.append(x)
                     sequence=[]
                     anotherList=[]

              if x != "" and not x.isdigit() :

                  list2.append(x)

        anotherList.append(list2)


list.append(Row(sequence=anotherList))
print(list)



#df = sc.parallelize(list).toDF()

#prefixSpan = PrefixSpan(minSupport=0.1, maxPatternLength=3,
 #                       maxLocalProjDBSize=32000000)

# Find frequent sequential patterns.
#prefixSpan.findFrequentSequentialPatterns(df).sort(desc("freq")).show(10,False)

[Row(sequence=[['shrimp', 'almonds', 'avocado', 'vegetables mix'], ['burgers', 'meatballs', 'eggs'], ['chutney'], ['turkey', 'avocado'], ['mineral water', 'milk', 'energy bar', 'whole wheat rice'], ['low fat yogurt']]), Row(sequence=[['whole wheat pasta', 'french fries'], ['soup', 'light cream', 'shallot'], ['frozen vegetables', 'spaghetti', 'green tea'], ['french fries'], ['eggs', 'pet food'], ['cookies']]), Row(sequence=[['turkey', 'burgers', 'mineral water', 'eggs'], ['spaghetti', 'champagne', 'cookies'], ['mineral water', 'salmon'], ['mineral water'], ['shrimp', 'chocolate', 'chicken', 'honey'], ['turkey', 'eggs'], ['turkey', 'fresh tuna', 'tomatoes', 'spaghetti']]), Row(sequence=[['meatballs', 'milk', 'honey', 'french fries'], ['red wine', 'shrimp', 'pasta', 'pepper'], ['rice', 'sparkling water'], ['spaghetti', 'mineral water', 'ham', 'body spray'], ['burgers', 'grated cheese', 'shrimp', 'pasta'], ['eggs']]), Row(sequence=[['parmesan cheese', 'spaghetti', 'soup', 'avocado'], ['gro