In [None]:
# Import libraries
 
import transformers as ts
from datasets import Dataset
from transformers import AutoTokenizer, AutoModel, AutoModelForSequenceClassification
from torch.optim import AdamW
import torch.nn as nn
import pandas as pd
import numpy as np
import torch
from torch.utils.data import DataLoader
from tqdm import tqdm

In [None]:
def set_seed(seed = 42):
    """Seed the Python, NumPy and PyTorch random number generators.

    Parameters
    ----------
    seed : int, default 42
        Value passed to every generator.

    Notes
    -----
    The CUDA call is safe on CPU-only machines: PyTorch documents it as
    silently ignored when CUDA is not available.
    """
    # Local import: the notebook's import cell does not bring in `random`.
    import random

    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    # Seed every visible GPU, not only the current device.
    torch.cuda.manual_seed_all(seed)

In [None]:
# Function to classify using the self-trained model

def ZS_self(data, batch_size = 5):
    """Predict a class index for every row of ``data['text']``.

    The texts are tokenized with the "distilbert/distilbert-base-uncased"
    tokenizer (truncated to 512 tokens) and scored by the two-label
    sequence-classification checkpoint loaded from the local directory
    hard-coded below.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain a ``text`` column; only that column is used.
    batch_size : int, default 5
        Number of texts per forward pass.

    Returns
    -------
    numpy.ndarray
        1-D array holding the argmax class index of each row, in the same
        order as ``data``. Empty when ``data`` has no rows.
    """
    set_seed()

    # Nothing to classify: return early instead of failing in np.hstack([]).
    if len(data) == 0:
        return np.array([], dtype=np.int64)

    tokenizer = AutoTokenizer.from_pretrained("distilbert/distilbert-base-uncased")
    model = AutoModelForSequenceClassification.from_pretrained(r'C:\Tempor\reddit\Classification_Model', num_labels=2)

    # The batches are moved to `device` below, so the model has to live there
    # too; otherwise a CUDA machine fails with a device-mismatch error.
    device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
    model.to(device)
    model.eval()

    # preserve_index=False keeps the pandas index out of the dataset, so no
    # '__index_level_0__' column has to be removed afterwards.
    dati = Dataset.from_pandas(data[['text']], preserve_index=False)

    def tokenize_function(examples):
        # No padding here: padding inside `map` is applied per map chunk, so
        # rows from different chunks could end up with different lengths.
        return tokenizer(examples["text"], max_length=512, truncation=True)

    tokenized_text = dati.map(tokenize_function, batched=True)
    tokenized_text = tokenized_text.remove_columns("text")
    tokenized_text.set_format("torch")

    # Pad each batch to its own longest sequence at collation time. This works
    # for any batch_size and avoids padding short texts to the chunk maximum.
    collator = ts.DataCollatorWithPadding(tokenizer=tokenizer)
    text_loader = DataLoader(
        tokenized_text,
        batch_size=batch_size,
        num_workers=0,
        shuffle=False,   # keep predictions aligned with the input rows
        collate_fn=collator,
    )

    # Clear tqdm's instance registry before opening a new bar (kept from the
    # original; presumably a workaround for stale bars in the notebook).
    tqdm._instances.clear()

    preds = []
    with torch.no_grad(), tqdm(total=len(text_loader)) as progress_bar:
        for batch in text_loader:
            batch = {k: v.to(device) for k, v in batch.items()}
            logits = model(**batch).logits
            preds.append(torch.argmax(logits, dim=-1).cpu().numpy())
            progress_bar.update(1)

    return np.hstack(preds)


## Data loading and classification

In [None]:
# Load the 2016_3_10 comment dumps (JSON Lines) for Amazon and Google and
# rename "body" to "text", the column ZS_self reads.
data_pandas = pd.read_json(
    r"C:\Tempor\Reddit\sentiment_data\amazon\2016_3_10.json", lines=True
).rename(columns={"body": "text"})
data_pandas_g = pd.read_json(
    r"C:\Tempor\Reddit\sentiment_data\google\2016_3_10.json", lines=True
).rename(columns={"body": "text"})

# One predicted class per comment (0 = positive, 1 = negative).
predictions = ZS_self(data_pandas)
predictions_g = ZS_self(data_pandas_g)

# Attach the Amazon predictions as a "Sentiment" column and save the result.
p = pd.DataFrame({'Sentiment': predictions.tolist()})
final_data = data_pandas.assign(Sentiment=p.values)
final_data.to_json(r"C:\Tempor\Reddit\sentiment_data\amazon_a\2016_3_10.json")

# Same for the Google predictions.
p_g = pd.DataFrame({'Sentiment': predictions_g.tolist()})
final_data_g = data_pandas_g.assign(Sentiment=p_g.values)
final_data_g.to_json(r"C:\Tempor\Reddit\sentiment_data\google_a\2016_3_10.json")

# data=pd.read_json(r"C:\Tempor\Reddit\sentiment_data\google\2015_4_18.json")
# data

Map:   0%|          | 0/690 [00:00<?, ? examples/s]

100%|██████████| 138/138 [05:22<00:00,  2.34s/it]


Map:   0%|          | 0/933 [00:00<?, ? examples/s]

100%|██████████| 187/187 [07:31<00:00,  2.42s/it]


In [None]:
# Load the 2016_3_9 comment dumps (JSON Lines) for Amazon and Google and
# rename "body" to "text", the column ZS_self reads.
data_pandas = pd.read_json(
    r"C:\Tempor\Reddit\sentiment_data\amazon\2016_3_9.json", lines=True
).rename(columns={"body": "text"})
data_pandas_g = pd.read_json(
    r"C:\Tempor\Reddit\sentiment_data\google\2016_3_9.json", lines=True
).rename(columns={"body": "text"})

# One predicted class per comment (0 = positive, 1 = negative).
predictions = ZS_self(data_pandas)
predictions_g = ZS_self(data_pandas_g)

# Attach the Amazon predictions as a "Sentiment" column and save the result.
p = pd.DataFrame({'Sentiment': predictions.tolist()})
final_data = data_pandas.assign(Sentiment=p.values)
final_data.to_json(r"C:\Tempor\Reddit\sentiment_data\amazon_a\2016_3_9.json")

# Same for the Google predictions.
p_g = pd.DataFrame({'Sentiment': predictions_g.tolist()})
final_data_g = data_pandas_g.assign(Sentiment=p_g.values)
final_data_g.to_json(r"C:\Tempor\Reddit\sentiment_data\google_a\2016_3_9.json")

# data=pd.read_json(r"C:\Tempor\Reddit\sentiment_data\google\2015_4_18.json")
# data

Map:   0%|          | 0/723 [00:00<?, ? examples/s]

100%|██████████| 145/145 [05:53<00:00,  2.44s/it]


Map:   0%|          | 0/1024 [00:00<?, ? examples/s]

100%|██████████| 205/205 [08:17<00:00,  2.43s/it]


In [None]:
# Load the 2016_6_26 comment dumps (JSON Lines) for Amazon and Google and
# rename "body" to "text", the column ZS_self reads.
data_pandas = pd.read_json(
    r"C:\Tempor\Reddit\sentiment_data\amazon\2016_6_26.json", lines=True
).rename(columns={"body": "text"})
data_pandas_g = pd.read_json(
    r"C:\Tempor\Reddit\sentiment_data\google\2016_6_26.json", lines=True
).rename(columns={"body": "text"})

# One predicted class per comment (0 = positive, 1 = negative).
predictions = ZS_self(data_pandas)
predictions_g = ZS_self(data_pandas_g)

# Attach the Amazon predictions as a "Sentiment" column and save the result.
p = pd.DataFrame({'Sentiment': predictions.tolist()})
final_data = data_pandas.assign(Sentiment=p.values)
final_data.to_json(r"C:\Tempor\Reddit\sentiment_data\amazon_a\2016_6_26.json")

# Same for the Google predictions.
p_g = pd.DataFrame({'Sentiment': predictions_g.tolist()})
final_data_g = data_pandas_g.assign(Sentiment=p_g.values)
final_data_g.to_json(r"C:\Tempor\Reddit\sentiment_data\google_a\2016_6_26.json")

# data=pd.read_json(r"C:\Tempor\Reddit\sentiment_data\google\2015_4_18.json")
# data

Map:   0%|          | 0/464 [00:00<?, ? examples/s]

100%|██████████| 93/93 [03:36<00:00,  2.33s/it]


Map:   0%|          | 0/769 [00:00<?, ? examples/s]

100%|██████████| 154/154 [06:10<00:00,  2.41s/it]


In [None]:
# Load the 2016_6_25 comment dumps (JSON Lines) for Amazon and Google and
# rename "body" to "text", the column ZS_self reads.
data_pandas = pd.read_json(
    r"C:\Tempor\Reddit\sentiment_data\amazon\2016_6_25.json", lines=True
).rename(columns={"body": "text"})
data_pandas_g = pd.read_json(
    r"C:\Tempor\Reddit\sentiment_data\google\2016_6_25.json", lines=True
).rename(columns={"body": "text"})

# One predicted class per comment (0 = positive, 1 = negative).
predictions = ZS_self(data_pandas)
predictions_g = ZS_self(data_pandas_g)

# Attach the Amazon predictions as a "Sentiment" column and save the result.
p = pd.DataFrame({'Sentiment': predictions.tolist()})
final_data = data_pandas.assign(Sentiment=p.values)
final_data.to_json(r"C:\Tempor\Reddit\sentiment_data\amazon_a\2016_6_25.json")

# Same for the Google predictions.
p_g = pd.DataFrame({'Sentiment': predictions_g.tolist()})
final_data_g = data_pandas_g.assign(Sentiment=p_g.values)
final_data_g.to_json(r"C:\Tempor\Reddit\sentiment_data\google_a\2016_6_25.json")

# data=pd.read_json(r"C:\Tempor\Reddit\sentiment_data\google\2015_4_18.json")
# data

Map:   0%|          | 0/791 [00:00<?, ? examples/s]

100%|██████████| 159/159 [06:46<00:00,  2.56s/it]


Map:   0%|          | 0/1079 [00:00<?, ? examples/s]

100%|██████████| 216/216 [09:14<00:00,  2.57s/it]


In [None]:
# Load the 2016_6_24 comment dumps (JSON Lines) for Amazon and Google and
# rename "body" to "text", the column ZS_self reads.
data_pandas = pd.read_json(
    r"C:\Tempor\Reddit\sentiment_data\amazon\2016_6_24.json", lines=True
).rename(columns={"body": "text"})
data_pandas_g = pd.read_json(
    r"C:\Tempor\Reddit\sentiment_data\google\2016_6_24.json", lines=True
).rename(columns={"body": "text"})

# One predicted class per comment (0 = positive, 1 = negative).
predictions = ZS_self(data_pandas)
predictions_g = ZS_self(data_pandas_g)

# Attach the Amazon predictions as a "Sentiment" column and save the result.
p = pd.DataFrame({'Sentiment': predictions.tolist()})
final_data = data_pandas.assign(Sentiment=p.values)
final_data.to_json(r"C:\Tempor\Reddit\sentiment_data\amazon_a\2016_6_24.json")

# Same for the Google predictions.
p_g = pd.DataFrame({'Sentiment': predictions_g.tolist()})
final_data_g = data_pandas_g.assign(Sentiment=p_g.values)
final_data_g.to_json(r"C:\Tempor\Reddit\sentiment_data\google_a\2016_6_24.json")

# data=pd.read_json(r"C:\Tempor\Reddit\sentiment_data\google\2015_4_18.json")
# data

Map:   0%|          | 0/796 [00:00<?, ? examples/s]

100%|██████████| 160/160 [06:24<00:00,  2.40s/it]


Map:   0%|          | 0/1077 [00:00<?, ? examples/s]

100%|██████████| 216/216 [08:40<00:00,  2.41s/it]


In [None]:
# Load the 2016_6_23 comment dumps (JSON Lines) for Amazon and Google and
# rename "body" to "text", the column ZS_self reads.
data_pandas = pd.read_json(
    r"C:\Tempor\Reddit\sentiment_data\amazon\2016_6_23.json", lines=True
).rename(columns={"body": "text"})
data_pandas_g = pd.read_json(
    r"C:\Tempor\Reddit\sentiment_data\google\2016_6_23.json", lines=True
).rename(columns={"body": "text"})

# One predicted class per comment (0 = positive, 1 = negative).
predictions = ZS_self(data_pandas)
predictions_g = ZS_self(data_pandas_g)

# Attach the Amazon predictions as a "Sentiment" column and save the result.
p = pd.DataFrame({'Sentiment': predictions.tolist()})
final_data = data_pandas.assign(Sentiment=p.values)
final_data.to_json(r"C:\Tempor\Reddit\sentiment_data\amazon_a\2016_6_23.json")

# Same for the Google predictions.
p_g = pd.DataFrame({'Sentiment': predictions_g.tolist()})
final_data_g = data_pandas_g.assign(Sentiment=p_g.values)
final_data_g.to_json(r"C:\Tempor\Reddit\sentiment_data\google_a\2016_6_23.json")

# data=pd.read_json(r"C:\Tempor\Reddit\sentiment_data\google\2015_4_18.json")
# data

Map:   0%|          | 0/802 [00:00<?, ? examples/s]

100%|██████████| 161/161 [06:26<00:00,  2.40s/it]


Map:   0%|          | 0/1102 [00:00<?, ? examples/s]

100%|██████████| 221/221 [08:51<00:00,  2.41s/it]


In [None]:
# Load the 2016_6_22 comment dumps (JSON Lines) for Amazon and Google and
# rename "body" to "text", the column ZS_self reads.
data_pandas = pd.read_json(
    r"C:\Tempor\Reddit\sentiment_data\amazon\2016_6_22.json", lines=True
).rename(columns={"body": "text"})
data_pandas_g = pd.read_json(
    r"C:\Tempor\Reddit\sentiment_data\google\2016_6_22.json", lines=True
).rename(columns={"body": "text"})

# One predicted class per comment (0 = positive, 1 = negative).
predictions = ZS_self(data_pandas)
predictions_g = ZS_self(data_pandas_g)

# Attach the Amazon predictions as a "Sentiment" column and save the result.
p = pd.DataFrame({'Sentiment': predictions.tolist()})
final_data = data_pandas.assign(Sentiment=p.values)
final_data.to_json(r"C:\Tempor\Reddit\sentiment_data\amazon_a\2016_6_22.json")

# Same for the Google predictions.
p_g = pd.DataFrame({'Sentiment': predictions_g.tolist()})
final_data_g = data_pandas_g.assign(Sentiment=p_g.values)
final_data_g.to_json(r"C:\Tempor\Reddit\sentiment_data\google_a\2016_6_22.json")

# data=pd.read_json(r"C:\Tempor\Reddit\sentiment_data\google\2015_4_18.json")
# data

Map:   0%|          | 0/806 [00:00<?, ? examples/s]

100%|██████████| 162/162 [06:29<00:00,  2.40s/it]


Map:   0%|          | 0/1116 [00:00<?, ? examples/s]

100%|██████████| 224/224 [08:50<00:00,  2.37s/it]


In [None]:
# Load the 2016_5_21 comment dumps (JSON Lines) for Amazon and Google and
# rename "body" to "text", the column ZS_self reads.
data_pandas = pd.read_json(
    r"C:\Tempor\Reddit\sentiment_data\amazon\2016_5_21.json", lines=True
).rename(columns={"body": "text"})
data_pandas_g = pd.read_json(
    r"C:\Tempor\Reddit\sentiment_data\google\2016_5_21.json", lines=True
).rename(columns={"body": "text"})

# One predicted class per comment (0 = positive, 1 = negative).
predictions = ZS_self(data_pandas)
predictions_g = ZS_self(data_pandas_g)

# Attach the Amazon predictions as a "Sentiment" column and save the result.
p = pd.DataFrame({'Sentiment': predictions.tolist()})
final_data = data_pandas.assign(Sentiment=p.values)
final_data.to_json(r"C:\Tempor\Reddit\sentiment_data\amazon_a\2016_5_21.json")

# Same for the Google predictions.
p_g = pd.DataFrame({'Sentiment': predictions_g.tolist()})
final_data_g = data_pandas_g.assign(Sentiment=p_g.values)
final_data_g.to_json(r"C:\Tempor\Reddit\sentiment_data\google_a\2016_5_21.json")

# data=pd.read_json(r"C:\Tempor\Reddit\sentiment_data\google\2015_4_18.json")
# data

Map:   0%|          | 0/892 [00:00<?, ? examples/s]

100%|██████████| 179/179 [07:11<00:00,  2.41s/it]


Map:   0%|          | 0/1158 [00:00<?, ? examples/s]

100%|██████████| 232/232 [09:09<00:00,  2.37s/it]


In [None]:
# Load the 2016_5_20 comment dumps (JSON Lines) for Amazon and Google and
# rename "body" to "text", the column ZS_self reads.
data_pandas = pd.read_json(
    r"C:\Tempor\Reddit\sentiment_data\amazon\2016_5_20.json", lines=True
).rename(columns={"body": "text"})
data_pandas_g = pd.read_json(
    r"C:\Tempor\Reddit\sentiment_data\google\2016_5_20.json", lines=True
).rename(columns={"body": "text"})

# One predicted class per comment (0 = positive, 1 = negative).
predictions = ZS_self(data_pandas)
predictions_g = ZS_self(data_pandas_g)

# Attach the Amazon predictions as a "Sentiment" column and save the result.
p = pd.DataFrame({'Sentiment': predictions.tolist()})
final_data = data_pandas.assign(Sentiment=p.values)
final_data.to_json(r"C:\Tempor\Reddit\sentiment_data\amazon_a\2016_5_20.json")

# Same for the Google predictions.
p_g = pd.DataFrame({'Sentiment': predictions_g.tolist()})
final_data_g = data_pandas_g.assign(Sentiment=p_g.values)
final_data_g.to_json(r"C:\Tempor\Reddit\sentiment_data\google_a\2016_5_20.json")

# data=pd.read_json(r"C:\Tempor\Reddit\sentiment_data\google\2015_4_18.json")
# data

Map:   0%|          | 0/997 [00:00<?, ? examples/s]

100%|██████████| 200/200 [08:01<00:00,  2.41s/it]


Map:   0%|          | 0/1197 [00:00<?, ? examples/s]

100%|██████████| 240/240 [09:28<00:00,  2.37s/it]


In [None]:
# Google only: no Amazon dump is classified in this cell.
# Load the 2015_10_43 Google comment dump (JSON Lines) and rename "body" to
# "text", the column ZS_self reads.
data_pandas_g = pd.read_json(
    r"C:\Tempor\Reddit\sentiment_data\google\2015_10_43.json", lines=True
).rename(columns={"body": "text"})

# One predicted class per comment (0 = positive, 1 = negative).
predictions_g = ZS_self(data_pandas_g)

# Attach the predictions as a "Sentiment" column and save the result.
p_g = pd.DataFrame({'Sentiment': predictions_g.tolist()})
final_data_g = data_pandas_g.assign(Sentiment=p_g.values)
final_data_g.to_json(r"C:\Tempor\Reddit\sentiment_data\google_a\2015_10_43.json")

# data=pd.read_json(r"C:\Tempor\Reddit\sentiment_data\google\2015_4_18.json")
# data

Map:   0%|          | 0/1463 [00:00<?, ? examples/s]

100%|██████████| 293/293 [12:12<00:00,  2.50s/it]
