# Setup

In [1]:
# Enable IPython's autoreload extension; mode 2 reloads all imported modules
# before each cell runs, so edits to local .py files are picked up without a
# kernel restart.
%load_ext autoreload
%autoreload 2

In [2]:
!pip install kaggle
!pip install pyspark

Collecting pyspark
  Downloading pyspark-3.2.1.tar.gz (281.4 MB)
[K     |████████████████████████████████| 281.4 MB 36 kB/s 
[?25hCollecting py4j==0.10.9.3
  Downloading py4j-0.10.9.3-py2.py3-none-any.whl (198 kB)
[K     |████████████████████████████████| 198 kB 66.9 MB/s 
[?25hBuilding wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25l[?25hdone
  Created wheel for pyspark: filename=pyspark-3.2.1-py2.py3-none-any.whl size=281853642 sha256=7afc76c31bb9d10275eecf55609c473727fc2d30b3185f59da74d70cc4829527
  Stored in directory: /root/.cache/pip/wheels/9f/f5/07/7cd8017084dce4e93e84e92efd1e1d5334db05f2e83bcef74f
Successfully built pyspark
Installing collected packages: py4j, pyspark
Successfully installed py4j-0.10.9.3 pyspark-3.2.1


In [3]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import os, time
from google.colab import drive, files
from pyspark import SparkContext
from pyspark.sql import SQLContext
from pyspark.sql import SparkSession
import re

In [4]:
# Switch the process time zone to US/Eastern; tzset() makes the `time` module
# re-read the TZ environment variable.
os.environ["TZ"] = "US/Eastern"
time.tzset()
# Mount Google Drive so results can be copied to it at the end of the notebook.
drive.mount('/content/drive')

Mounted at /content/drive


# Authenticating with Kaggle using kaggle.json
Navigate to https://www.kaggle.com. Then go to the Account tab of your user profile and select Create API Token. This will trigger the download of kaggle.json, a file containing your API credentials.

Then run the cell below to upload kaggle.json to your Colab runtime.

In [5]:
uploaded = files.upload()

for fn in uploaded.keys():
  print('User uploaded file "{name}" with length {length} bytes'.format(
      name=fn, length=len(uploaded[fn])))
  
# Then move kaggle.json into the folder where the API expects to find it.
!mkdir -p ~/.kaggle/ && mv kaggle.json ~/.kaggle/ && chmod 600 ~/.kaggle/kaggle.json

Saving kaggle.json to kaggle.json
User uploaded file "kaggle.json" with length 62 bytes


In [6]:
# Download the dataset archive (emojifydata-en.zip) into the current working
# directory using the Kaggle CLI and the credentials installed above.
!kaggle datasets download "rexhaif/emojifydata-en"

Downloading emojifydata-en.zip to /content
 98% 871M/886M [00:06<00:00, 157MB/s]
100% 886M/886M [00:06<00:00, 147MB/s]


In [7]:
!mkdir "kaggle"
!mkdir "kaggle/input"
!unzip emojifydata-en.zip -d "kaggle/input"

Archive:  emojifydata-en.zip
  inflating: kaggle/input/dev.txt    
  inflating: kaggle/input/emojitweets-01-04-2018.txt  
  inflating: kaggle/input/test.txt   
  inflating: kaggle/input/train.txt  


# Cleaning Dataset

In [8]:
# List the extracted dataset files as a sanity check.
# The archive was unpacked into the *relative* directory "kaggle/input" (under
# the working directory), so walk that path; the absolute "/kaggle/input" does
# not exist in this runtime and would silently print nothing.
for dirname, _, filenames in os.walk('./kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

In [9]:
# Start (or reuse) a Spark session whose driver is configured with 10g of
# memory, then expose the handles used by the rest of the notebook:
#   ss - SparkSession, sc - SparkContext (RDD API), s - SQLContext (DataFrames).
ss = (
    SparkSession.builder
    .config("spark.driver.memory", "10g")
    .getOrCreate()
)
sc = ss.sparkContext
s = SQLContext(sc)



Read `train.txt` as an RDD. Change the path to process the other files (`dev.txt`, `test.txt`).

In [10]:
# Read the training file as an RDD with one element per line of text.
# NOTE: the name `tweets` is reused further down for other objects.
tweets = sc.textFile("./kaggle/input/train.txt")

Split the lines into tokens and collect them into a Python list so the words can be concatenated back into tweets.

In [11]:
# Split every line on single spaces and drop the tokens that carry no content:
# the bare "O" marker (presumably the "no emoji" label - verify against the
# dataset format), empty strings, and the "<STOP>" sentinel.
tweets_clean = (
    tweets
    .flatMap(lambda line: line.split(' '))
    .filter(lambda token: token not in ('O', '', '<STOP>'))
)
# Bring the whole token stream to the driver as a Python list.
tweet_list = tweets_clean.collect()

In [12]:
# Accumulator state shared with wordsToTweets():
#   tweets  - one string per tweet, built up token by token
#   counter - index into `tweets` of the tweet currently being assembled
#             (-1 until the first '<START>' token has been seen)
tweets = []
counter = -1


def wordsToTweets(x):
    """Feed one token of the flattened token stream into the tweet accumulator.

    A '<START>' token opens a new, empty tweet. Any other token is appended,
    followed by a single space, to the tweet that is currently open.

    Args:
        x: the next token (a string) from the token stream.

    Raises:
        IndexError: if a non-'<START>' token arrives while `tweets` is empty.
    """
    global counter
    if x != '<START>':
        tweets[counter] += x + " "
        return
    # New tweet: advance the cursor and open an empty slot for it.
    counter += 1
    tweets.append('')

In [13]:
# Rebuild whole tweets from the token stream collected on the driver.
for word in tweet_list:
    wordsToTweets(word)

# Convert back to an RDD keyed by tweet index: (index, raw_tweet).
# NOTE: this rebinds `tweets` from the Python list to an RDD, so the cell that
# resets `tweets` and `counter` must be re-run before executing this cell again.
tweet_rdd = sc.parallelize(tweets)
tweets = tweet_rdd.zipWithIndex().map(lambda x: (x[1], x[0]))

# Split each tweet into its text (emoji tokens removed) and its list of
# ":name:" emoji tokens.
text_rdd = tweets.map(lambda x: (x[0], re.sub(":.*?:", "", x[1])))
emoji_rdd = tweets.map(lambda x: (x[0], re.findall(":.*?:", x[1])))

# Explode the emoji lists into one (index, emoji) pair per occurrence.
# A single flatMapValues replaces the per-position filter/map/union loop, which
# left `emoji_rdd3` undefined when no tweet had more than one emoji, raised
# IndexError for tweets without any emoji, and needed an extra max() job.
emoji_rdd3 = emoji_rdd.flatMapValues(lambda emojis: emojis)

# Pair every tweet text with each of its emojis. Tweets without an emoji have
# no label and are dropped by the inner join. `[:-1]` removes the trailing
# space added while assembling the tweet; `[1:-1]` strips the surrounding
# colons from the emoji name.
rdd_for_df = text_rdd.join(emoji_rdd3).map(lambda x: (x[1][0][:-1], x[1][1][1:-1]))

# Create the DataFrame of unique (text, emoji) rows.
df = s.createDataFrame(rdd_for_df, ['text', 'emoji']).distinct()
df.show()

+--------------------+--------------------+
|                text|               emoji|
+--------------------+--------------------+
|So excited to see...|       raising_hands|
|CITY OF DUNDEE CO...|beaming_face_with...|
|A late one or ear...|         right_arrow|
|The Lord always s...|        folded_hands|
|Wouldnt be the pe...|           red_heart|
|IEXOL KEXOL JEXOL...|          heart_suit|
|Could you just be...|  loudly_crying_face|
|Behaviors beat va...|           thumbs_up|
|Classy girl AVAIL...|         female_sign|
|Gotta wait for th...|                eyes|
|   This is a winner |face_with_tears_o...|
|So funny catchin ...|face_with_tears_o...|
|Twitter do ya tha...|    person_shrugging|
|Just ignore them ...|smiling_face_with...|
|thank you taco  ️...|face_with_tears_o...|
|THATS OUR ROOK BA...|                fire|
|that part of the ...|          two_hearts|
|I feel like youre...|     sparkling_heart|
|Still tryna figur...|face_with_tears_o...|
| They are so cute  …|  loudly_c

In [14]:
# Write the (text, emoji) rows as CSV part files into the "train" directory.
# Overwrite mode lets the cell be re-run; with the default mode the write
# fails as soon as the output directory already exists.
df.write.mode('overwrite').csv('train')

In [16]:
# Recursively archive the "train" output directory. Everything inside it is
# included, hidden files such as the .crc checksums as well.
!zip -r train.zip train

  adding: train/ (stored 0%)
  adding: train/.part-00007-45f36769-3c72-4e5b-a517-082707e8869b-c000.csv.crc (deflated 0%)
  adding: train/train2.csv (deflated 62%)
  adding: train/train4.csv (deflated 62%)
  adding: train/.part-00004-45f36769-3c72-4e5b-a517-082707e8869b-c000.csv.crc (deflated 0%)
  adding: train/train1.csv (deflated 62%)
  adding: train/.part-00001-45f36769-3c72-4e5b-a517-082707e8869b-c000.csv.crc (deflated 0%)
  adding: train/._SUCCESS.crc (stored 0%)
  adding: train/.part-00002-45f36769-3c72-4e5b-a517-082707e8869b-c000.csv.crc (deflated 0%)
  adding: train/.part-00000-45f36769-3c72-4e5b-a517-082707e8869b-c000.csv.crc (deflated 0%)
  adding: train/.part-00006-45f36769-3c72-4e5b-a517-082707e8869b-c000.csv.crc (deflated 0%)
  adding: train/train7.csv (deflated 62%)
  adding: train/.ipynb_checkpoints/ (stored 0%)
  adding: train/train3.csv (deflated 62%)
  adding: train/.part-00005-45f36769-3c72-4e5b-a517-082707e8869b-c000.csv.crc (deflated 0%)
  adding: train/.part-00003

In [17]:
# Move the archive to the "EECS 545" shared drive. The path is relative, so it
# relies on Google Drive being mounted at ./drive of the working directory.
!mv ./train.zip ./drive/Shareddrives/EECS\ 545/