In [None]:
import json
import os
import pandas as pd
import re

# Connect to my Drive

In [None]:
# Colab-only step: mount Google Drive at /content/drive so the project files
# stored there can be read and written by the cells below.
from google.colab import drive
drive.mount("/content/drive")

Mounted at /content/drive


In [None]:
# Root folder of the project on the mounted Drive. Change the path as required.
path = "/content/drive/MyDrive/11785-project"
# No working-directory change is performed here: the cells in this notebook build
# absolute paths from `path` with os.path.join. (A `!cd` shell escape would not
# persist beyond its own subshell; use the `%cd` magic if a directory change is
# ever needed.)

In [None]:
# Relative locations (under `path`) of the two training files: headlines first, posts second.
train_json_path = ["fiqa2018/FiQA_ABSA_task1/task1_headline_ABSA_train.json", "fiqa2018/FiQA_ABSA_task1/task1_post_ABSA_train.json"]

# Sanity check: load the post training file (index 1) and report how many
# top-level entries it contains. Switch the index to 0 to inspect the headline file.
fiqaPath = os.path.join(path, train_json_path[1])
with open(fiqaPath, encoding="utf-8") as f:
    data = json.load(f)
print(len(data))

675


# Custom DataExtractor

In [None]:
class DataExtractor:
    """Flatten an aspect-sentiment JSON file into rows, a DataFrame, and a CSV.

    The JSON file is read as a mapping ``id -> {"sentence": str, "info": [...]}``.
    Every element of ``info`` carries ``"snippets"`` and ``"target"``; for the
    ``"train"`` partition it also carries ``"sentiment_score"`` and ``"aspects"``.
    One output row is produced per ``info`` element.
    """

    def __init__(self, path, partition="train"):
        """Remember the source file and pick the column layout for the partition.

        Args:
            path: Location of the JSON file to extract.
            partition: ``"train"`` adds the label columns (``sentiment_score``,
                ``aspects``); any other value yields the unlabeled layout.
        """
        self.path = path
        self.fname = path.split('/')[-1]
        self.partition = partition
        if partition == "train":
            self.columns = ["id", "sentence", "snippets", "target", "sentiment_score", "aspects"]
        else:
            self.columns = ["id", "sentence", "snippets", "target"]

    def extract_data(self):
        """Read the JSON file and return one row (a list) per info entry.

        Returns:
            A list of rows ordered like ``self.columns``:
            ``[id, sentence, snippets, target]`` plus
            ``[sentiment_score, aspects]`` when the partition is ``"train"``.
            Entries whose ``info`` list is empty contribute no rows.
        """
        with open(self.path, encoding="utf-8") as f:
            raw_data = json.load(f)

        is_train = self.partition == "train"
        data = []
        for record_id, entry in raw_data.items():
            sentence = entry["sentence"]
            for info in entry["info"]:
                # [2:-2] drops the first and last two characters of the stored
                # string -- presumably the "['" / "']" of a stringified
                # one-element list (confirm against the raw data).
                row = [record_id, sentence, info["snippets"][2:-2], info["target"]]

                if is_train:
                    row.extend([float(info["sentiment_score"]), info["aspects"][2:-2]])

                # Append inside the inner loop so that every info entry of a
                # sentence is kept, not only the last one.
                data.append(row)

        return data

    def convert_to_df(self, data):
        """Wrap the extracted rows in a DataFrame labeled with ``self.columns``."""
        return pd.DataFrame(data, columns=self.columns)

    def clean_df(self, df):
        """Strip URL-like tokens from the text columns.

        Every match of ``http\\S+`` is removed from ``sentence`` and
        ``snippets``. The frame is modified in place and also returned.
        """
        for column in ("sentence", "snippets"):
            df[column] = df[column].apply(lambda x: re.sub(r'http\S+', '', x))
        return df

    def write_df(self, df, path = None):
        """Write ``df`` as CSV (row index included).

        Args:
            df: Frame to save.
            path: Destination file. When omitted, the source path with
                ``".json"`` replaced by ``".csv"`` is used.
        """
        if path is None:
            path = self.path.replace(".json", ".csv")
        df.to_csv(path)

In [None]:
# Headline training split: extract rows, tabulate them, strip URLs, save as CSV
# next to the source JSON.
train_path1 = os.path.join(path, "fiqa2018/FiQA_ABSA_task1/task1_headline_ABSA_train.json")
de1 = DataExtractor(train_path1)
data1 = de1.extract_data()
df1 = de1.clean_df(de1.convert_to_df(data1))
de1.write_df(df1)

In [None]:
# Post training split: extract rows, tabulate them, strip URLs, save as CSV
# next to the source JSON.
train_path2 = os.path.join(path, "fiqa2018/FiQA_ABSA_task1/task1_post_ABSA_train.json")
de2 = DataExtractor(train_path2)
data2 = de2.extract_data()
df2 = de2.convert_to_df(data2)
# Clean with this cell's own extractor so the cell does not depend on `de1`.
df2 = de2.clean_df(df2)
de2.write_df(df2)

In [None]:
# Post test split (no label columns): extract rows, tabulate them, strip URLs,
# save as CSV next to the source JSON.
test_path1 = os.path.join(path, "fiqa2018/FIQA_ABSA_task1_test/task1_post_ABSA_test.json")
de3 = DataExtractor(test_path1, "test")
data3 = de3.extract_data()
df3 = de3.convert_to_df(data3)
# Clean with this cell's own extractor so the cell does not depend on `de1`.
df3 = de3.clean_df(df3)
de3.write_df(df3)

In [None]:
# Headline test split (no label columns): extract rows, tabulate them, strip
# URLs, save as CSV next to the source JSON.
test_path2 = os.path.join(path, "fiqa2018/FIQA_ABSA_task1_test/task1_headline_ABSA_test.json")
de4 = DataExtractor(test_path2, "test")
data4 = de4.extract_data()
df4 = de4.convert_to_df(data4)
# Clean with this cell's own extractor so the cell does not depend on `de1`.
df4 = de4.clean_df(df4)
de4.write_df(df4)

In [None]:
# Preview the first rows of the headline training frame.
df1.head()

Unnamed: 0,id,sentence,snippets,sentiment_score,target,aspects
0,1,Royal Mail chairman Donald Brydon set to step ...,set to step down,Royal Mail,-0.374,Corporate/Appointment
1,7,Stakes High for AstraZeneca Heart Drug Facing ...,Facing Tough Competition,AstraZeneca,-0.24,Corporate/Risks
2,8,UPDATE 1-Dairy Crest loses a third of Morrison...,Crest loses a third of Morrisons milk contract,Morrisons,-0.161,Corporate/Sales/Failed Contract Discussion
3,22,Insight hires Aviva's David Hillier for multi-...,hires Aviva's David Hillier for multi-asset team,Insight,0.137,Corporate/Appointment/Executive Appointment
4,30,Primark racks up a happy Christmas after stron...,after strong sales,Primark,0.704,Corporate/Sales


In [None]:
# Preview the first rows of the post training frame.
df2.head()

Unnamed: 0,id,sentence,snippets,sentiment_score,target,aspects
0,14860,Slowly adding some $FIO here but gotta be care...,Slowly adding some $FIO here but gotta be careful,FIO,0.459,Stock/Price Action/Bullish/Bull Position
1,14864,$TRX Long setup. MACD cross.,Long setup. MACD cross.,TRX,0.438,Stock/Technical Analysis
2,14867,I am not optimistic about $amzn both fundement...,both fundementals and charts look like poopoo ...,AMZN,-0.506,Stock/Price Action/Bearish
3,14875,$GRPN might be selling off ahead of $P earning...,might be selling off ahead,P,-0.202,Stock/Price Action/Bearish/Bearish Behavior
4,14876,$IACI Looks good on the weekly chart.,Looks good on the weekly chart.,IACI,0.379,Stock/Technical Analysis


In [None]:
# Preview the first rows of the post test frame.
df3.head()

Unnamed: 0,id,sentence,snippets,target
0,19167$GILD,$GILD Back in Bullish,Back in,GILD
1,19168$LE,$LE Back in for a swing on this one @ 15.35/sh...,Back in for a swing,LE
2,19169$WLL,$WLL In for a quarter position @ 8.10...,In for a quarter position,WLL
3,19170$VNR,"$VNR Just added 2,300 @ .77 Going in for long ...","Just added 2,300",VNR
4,19171$TSLA,"Bought back into $TSLA, previously sold at $362",Bought,TSLA


In [None]:
# Preview the first rows of the headline test frame.
df4.head()

Unnamed: 0,id,sentence,snippets,target
0,0_Cuadrilla,Cuadrilla files to delay application to frack ...,files to delay application,Cuadrilla
1,1001_Sainsbury,Sainsbury chief warns of squeeze on high stree...,warns of squeeze on high street retailers,Sainsbury
2,1006_Barclays,Barclays fined for anti-money-laundering failings,fined for anti-money-laundering failings,Barclays
3,1007_Barclays,UPDATE 3-Barclays fined for lax crime checks i...,fined for lax crime checks in,Barclays
4,1014_GSK,GSK aims to file up to 20 new drugs for approv...,file up to 20 new drugs for approval by 2020,GSK
