In [1]:
# Mount the Google Drve for getting dataset

from google.colab import drive
drive.mount('/content/drive')

# Change directory to the project directory

import os
os.chdir('/content/drive/MyDrive/turkish-tsa/')

Mounted at /content/drive


In [2]:
import csv, random

# Change the seed for different splits:
random.seed(2)

In [3]:
def getRatios(data):
    for key in data:
        print(key)
        nofTrain = 0
        nofTest = 0
        nofVal = 0
        for item in data[key]:
            if item in trainList:
                nofTrain += 1
            elif item in testList:
                nofTest += 1
            else:
                nofVal += 1
        nofAllData = len(data[key])
        print("\tNumber of data: %d" % nofAllData)
        print("\tTrain ratio: %f" % (nofTrain/nofAllData))
        print("\tTest ratio: %f" % (nofTest/nofAllData))
        print("\tValidation ratio: %f\n" % (nofVal/nofAllData))

In [4]:
trainPercent = 0.65
testPercent = 0.2

In [5]:
targets = ["trendyol", "whatsapp", "ekşi sözlük", "coca cola", "turkcell", "hepsiburada"]
tSentiments = ["neutral", "negative", "positive"]
sSentiments = ["neutral", "negative", "positive"]

In [6]:
labelInfo = {}
data = {}

for target in targets:
    for tSentiment in tSentiments:
        for sSentiment in sSentiments:
            labelInfo[(tSentiment,sSentiment,target)] = []

In [7]:
with open("./dataset/raw_dataset/raw_dataset.csv") as f:
    csvReader = csv.reader(f, delimiter=',')
    next(csvReader)
    
    for values in csvReader:
        text = values[0]
        tweetUrl = values[1]
        tSentiment = values[2]
        sSentiment = values[3]
        target = values[4]
        labelInfo[(tSentiment, sSentiment, target)].append(text)
        data[text] = [tweetUrl, tSentiment, sSentiment, target]

In [8]:
nofData = len(data.keys())
trainList = []
testList = []
validationList = []

different_s_t_list = [] # Data whose sentence and targeted sentiments are different

same_s_t_list = [] # Data whose sentence and targeted sentiments are different

both_positive_list = [] # Data whose sentence and targeted sentiments are both positive

both_negative_list = [] # Data whose sentence and targeted sentiments are both negative

both_neutral_list = [] # Data whose sentence and targeted sentiments are both neutral

ts_positive_list = [] # Data whose targeted sentiment is positive

ts_negative_list = [] # Data whose targeted sentiment is negative

ts_neutral_list = [] # Data whose targeted sentiment is neutral

In [9]:
brandData = {}
for brand in targets:
    brandData[brand] = []

tSentimentData = {}
for ts in tSentiments:
    tSentimentData[ts] = []

sSentimentData = {}
for ss in sSentiments:
    sSentimentData[ss] = []

In [10]:
for key in labelInfo:
    random.shuffle(labelInfo[key])

    for item in labelInfo[key]:

        ts,ss,brand = key
        brandData[brand].append(item)
        tSentimentData[ts].append(item)
        sSentimentData[ss].append(item)

        prob = random.random()
        if prob <= trainPercent:
            trainList.append(item)
        elif prob <= trainPercent + testPercent :
            testList.append(item)
        else:
            validationList.append(item)

        if ts != ss:
            different_s_t_list.append(item)

        elif ts == ss:
            same_s_t_list.append(item) 

        if ts == "positive":
            ts_positive_list.append(item)
            if ss == "positive":
                both_positive_list.append(item)   

        elif ts == "neutral":
            ts_neutral_list.append(item)
            if ss == "neutral":
                both_neutral_list.append(item)    

        elif ts == "negative":
            ts_negative_list.append(item)
            if ss == "negative":
                both_negative_list.append(item)   

evaluation_lists = [different_s_t_list, same_s_t_list, ts_positive_list, \
                    ts_neutral_list, ts_negative_list, both_positive_list, \
                    both_neutral_list, both_negative_list]

In [11]:
random.shuffle(trainList)
random.shuffle(testList)
random.shuffle(validationList)

for lst in evaluation_lists:
    random.shuffle(lst)

In [12]:
print("\n######################\n")
print("All Data:")
print("\tTrain ratio: %f" % (len(trainList)/nofData))
print("\tTest ratio: %f" % (len(testList)/nofData))
print("\tValidation ratio: %f" % (len(validationList)/nofData))


######################

All Data:
	Train ratio: 0.642711
	Test ratio: 0.202162
	Validation ratio: 0.160093


In [13]:
for d,l in [("train",trainList),("test",testList),("validation",validationList)]:
    with open("./dataset/raw_dataset/raw_%s_dataset.csv" % d, "w") as f:
        csvWriter = csv.writer(f)
        csvWriter.writerow(["Text","Tweet URL", "Targeted Sentiment", "Sentence Sentiment", "Target"])
        for text in l:
            row = [text] + list(data[text])
            csvWriter.writerow(row)

    with open("./turkish-tsa-public/dataset/%s_dataset.csv" % d, "w") as f:
        csvWriter = csv.writer(f)
        csvWriter.writerow(["Tweet URL", "Targeted Sentiment", "Sentence Sentiment", "Target"])
        for text in l:
            row = list(data[text])
            csvWriter.writerow(row)

    with open("./dataset/cls_dataset/cls_%s_dataset.csv" % d, "w") as f:
        csvWriter = csv.writer(f)
        csvWriter.writerow(["Text","Tweet URL", "Targeted Sentiment", "Sentence Sentiment", "Target"])
        for text in l:
            clsText = "[CLS] " + text    
            row = [clsText] + list(data[text])
            csvWriter.writerow(row)

    with open("./dataset/tar_dataset/tar_%s_dataset.csv" % d, "w") as f:
        csvWriter = csv.writer(f)
        csvWriter.writerow(["Text","Tweet URL", "Targeted Sentiment", "Sentence Sentiment", "Target"])
        for text in l:
            target = data[text][3]
            parts = text.split(target)
            tarText = "[CLS] " + parts[0] + " [TAR] " + target + " [TAR] " + parts[1]  
            row = [tarText] + list(data[text])
            csvWriter.writerow(row)

    with open("./dataset/u_dataset/u_%s_dataset.csv" % d, "w") as f:
        csvWriter = csv.writer(f)
        csvWriter.writerow(["Text","Tweet URL", "Targeted Sentiment", "Sentence Sentiment", "Target"])
        for text in l:
            target = data[text][3]
            parts = text.split(target)
            tarText = "[CLS] " + parts[0] + " [unused0] " + target + " [unused0] " + parts[1]  
            row = [tarText] + list(data[text])
            csvWriter.writerow(row)

In [14]:
for d,l in [("diff_s_t", different_s_t_list),("same_s_t", same_s_t_list),\
            ("positive_t", ts_positive_list),("negative_t",ts_negative_list),\
            ("neutral_t",ts_neutral_list),("positive_s_t",both_positive_list),\
            ("negative_s_t",both_negative_list),("neutral_s_t",both_neutral_list)]:
    with open("./dataset/raw_dataset/raw_%s.csv" % d, "w") as f:
        csvWriter = csv.writer(f)
        csvWriter.writerow(["Text","Tweet URL", "Targeted Sentiment", "Sentence Sentiment", "Target"])
        for text in l:
            row = [text] + list(data[text])
            csvWriter.writerow(row)

    with open("./dataset/cls_dataset/cls_%s.csv" % d, "w") as f:
        csvWriter = csv.writer(f)
        csvWriter.writerow(["Text","Tweet URL", "Targeted Sentiment", "Sentence Sentiment", "Target"])
        for text in l:
            clsText = "[CLS] " + text    
            row = [clsText] + list(data[text])
            csvWriter.writerow(row)

    with open("./dataset/tar_dataset/tar_%s.csv" % d, "w") as f:
        csvWriter = csv.writer(f)
        csvWriter.writerow(["Text","Tweet URL", "Targeted Sentiment", "Sentence Sentiment", "Target"])
        for text in l:
            target = data[text][3]
            parts = text.split(target)
            tarText = "[CLS] " + parts[0] + " [TAR] " + target + " [TAR] " + parts[1]  
            row = [tarText] + list(data[text])
            csvWriter.writerow(row)

    with open("./dataset/u_dataset/u_%s.csv" % d, "w") as f:
        csvWriter = csv.writer(f)
        csvWriter.writerow(["Text","Tweet URL", "Targeted Sentiment", "Sentence Sentiment", "Target"])
        for text in l:
            target = data[text][3]
            parts = text.split(target)
            tarText = "[CLS] " + parts[0] + " [unused0] " + target + " [unused0] " + parts[1]  
            row = [tarText] + list(data[text])
            csvWriter.writerow(row)