In [None]:
import numpy as np
import pandas as pd
import os
import re
import itertools

# Wczytanie Danych

In [None]:
# Load the raw tweets and keep only the columns the analysis below relies on.
original_df = pd.read_csv('./Tweets.csv')
minified_df = original_df.drop(columns=["selected_text", "textID"])
minified_df

Unnamed: 0,text,sentiment
0,"I`d have responded, if I were going",neutral
1,Sooo SAD I will miss you here in San Diego!!!,negative
2,my boss is bullying me...,negative
3,what interview! leave me alone,negative
4,"Sons of ****, why couldn`t they put them on t...",negative
...,...,...
27476,wish we could come see u on Denver husband l...,negative
27477,I`ve wondered about rake to. The client has ...,negative
27478,Yay good for both of you. Enjoy the break - y...,positive
27479,But it was worth it ****.,positive


# Analiza Danych

## Przekleństwa

In [None]:
# Count how many times the censoring placeholder "****" occurs in each tweet.
df_with_curses = minified_df.copy()
df_with_curses["curses"] = df_with_curses["text"].map(lambda text: str(text).count("****"))
df_with_curses

Unnamed: 0,text,sentiment,curses
0,"I`d have responded, if I were going",neutral,0
1,Sooo SAD I will miss you here in San Diego!!!,negative,0
2,my boss is bullying me...,negative,0
3,what interview! leave me alone,negative,0
4,"Sons of ****, why couldn`t they put them on t...",negative,1
...,...,...,...
27476,wish we could come see u on Denver husband l...,negative,0
27477,I`ve wondered about rake to. The client has ...,negative,0
27478,Yay good for both of you. Enjoy the break - y...,positive,0
27479,But it was worth it ****.,positive,1


##### Wiadomości z największą ilością przekleństw

In [None]:
# Ten tweets with the highest placeholder count.
df_with_curses.sort_values(by="curses", ascending=False).head(10)

Unnamed: 0,text,sentiment,curses
13782,Um. Why can`t I write **** tonight? I like ***...,positive,4
24681,`Whats your stand on **** marriage?` - me. `wh...,negative,4
13872,"**** **** **** ****, please comment me back ILY",neutral,4
13340,_joyner And he can`t even tell me. Me and him...,negative,3
23866,"Ha **** im ready. but **** that ****, i cant ...",neutral,3
10709,i cant even call a **** man **** on bogus **...,neutral,3
27148,Someone just **** up birthday **** for me ... ...,negative,3
13901,"its from all that **** ****, i forgot the lube...",neutral,3
17617,HAHA Amen! Im sooo **** hungry... And I hate ...,negative,2
15848,"I`m at Cobra, Mexican bird flu hub/**** bar, a...",negative,2


##### Średnia przekleństw wśród wszystkich wiadomości


In [None]:
# Mean number of "****" placeholders per tweet for each sentiment class.
# The column is selected explicitly: the list previously handed to mean() was
# taken as its first positional parameter (numeric_only), not as a column
# selection, and the object `text` column makes a bare mean() fail on pandas >= 2.0.
df_with_curses.groupby("sentiment")[["curses"]].mean()

Unnamed: 0_level_0,curses
sentiment,Unnamed: 1_level_1
negative,0.073384
neutral,0.025094
positive,0.026567


#### Wnioski

Wiadomości, w których występują przekleństwa, znacznie częściej mają negatywny przekaz.

## Interpunkcja

In [None]:
# Independent working copies for the two punctuation features analysed below.
df_question_marks = minified_df.copy()
df_exclemation_marks = minified_df.copy()

### Znaki zapytania

In [None]:
# Raw number of "?" characters in each tweet.
df_question_marks["question_marks"] = df_question_marks["text"].map(lambda text: str(text).count("?"))
# Thresholded count: whitespace is removed first, then every run of three or
# more "?" is collapsed to exactly "???", so one long run cannot dominate.
# The pattern is a raw string; the former '\?{3,}' relied on an invalid escape
# sequence, which Python reports with a warning.
df_question_marks["question_marks_thresholding"] = df_question_marks["text"].map(
    lambda text: re.sub(r'\?{3,}', '???', re.sub(r'\s+', '', str(text))).count("?")
)
# Flag tweets whose thresholded count is still above 5.
df_question_marks["question_marks_over_threashold"] = df_question_marks["question_marks_thresholding"] > 5
df_question_marks

Unnamed: 0,text,sentiment,question_marks,question_marks_thresholding,question_marks_over_threashold
0,"I`d have responded, if I were going",neutral,0,0,False
1,Sooo SAD I will miss you here in San Diego!!!,negative,0,0,False
2,my boss is bullying me...,negative,0,0,False
3,what interview! leave me alone,negative,0,0,False
4,"Sons of ****, why couldn`t they put them on t...",negative,0,0,False
...,...,...,...,...,...
27476,wish we could come see u on Denver husband l...,negative,0,0,False
27477,I`ve wondered about rake to. The client has ...,negative,0,0,False
27478,Yay good for both of you. Enjoy the break - y...,positive,0,0,False
27479,But it was worth it ****.,positive,0,0,False


##### Wiadomości z największą ilością znaków zapytania

In [None]:
# Ten tweets with the most "?" characters (raw count).
df_question_marks.sort_values(by="question_marks", ascending=False).head(10)

Unnamed: 0,text,sentiment,question_marks,question_marks_thresholding,question_marks_over_threashold
3293,Legend of Inotia ??????? ??????. ?????? ??????...,neutral,98,18,True
21418,??? weekend standby duty??????????????????????...,neutral,53,6,True
17339,_nobel ? ? ????????????? = ?????? ?????? ????...,neutral,39,6,True
1033,??????? #bash ?? ??????? ????????? ????,neutral,29,6,True
24999,"???? ? ??????, ?????? ????????,Basquash,K-ON...",neutral,25,6,True
4655,"???ï¿½ï¿½ ???CE d grade?mock?????, ???????",neutral,19,13,True
18346,sike sike call it truce???????????????? u sti...,negative,16,3,False
11767,????? http://is.gd/wxMt . ??????? ??? and may...,neutral,15,6,True
4922,I`ve been losing myself into too many Taiwanes...,negative,14,9,True
15552,"yï¿½n tï¿½m, sang n?m s? th?y **** m?c Tr?n t...",neutral,14,14,True


##### Wiadomości z największą ilością znaków zapytania po zastosowaniu progowania

In [None]:
# Ten tweets with the most "?" characters after collapsing long runs.
df_question_marks.sort_values(by="question_marks_thresholding", ascending=False).head(10)

Unnamed: 0,text,sentiment,question_marks,question_marks_thresholding,question_marks_over_threashold
3293,Legend of Inotia ??????? ??????. ?????? ??????...,neutral,98,18,True
15552,"yï¿½n tï¿½m, sang n?m s? th?y **** m?c Tr?n t...",neutral,14,14,True
4655,"???ï¿½ï¿½ ???CE d grade?mock?????, ???????",neutral,19,13,True
4107,"Thank a Chu?t ï¿½ Ch?c v?y quï¿½, hik, ch?c ...",positive,13,13,True
7855,?ï¿½ nh?n ???c bonus r?i. Ch?c ch? ?? bï¿½ vï¿...,neutral,11,11,True
947,???-?? ? youtube ? ????,neutral,11,9,True
4922,I`ve been losing myself into too many Taiwanes...,negative,14,9,True
10801,What!?!?! I can`t believe I had to find out t...,negative,7,7,True
10190,Now that sucks... P?i ?i s? ï¿½n?eleg c? Jay...,negative,7,7,True
11669,SON! WTF?? She just bit holes in the **** bre...,negative,7,7,True


##### Średnia wystąpień znaków zapytania przed i po zastosowaniu progowania

In [None]:
# Per-sentiment mean of the question-mark features. The free-text column is
# dropped first: averaging an object column raises TypeError on pandas >= 2.0
# (older versions silently discarded it with a deprecation warning).
df_question_marks.drop(columns="text").groupby("sentiment").mean()

Unnamed: 0_level_0,question_marks,question_marks_thresholding,question_marks_over_threashold
sentiment,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
negative,0.140856,0.13533,0.000771
neutral,0.20921,0.185375,0.000989
positive,0.098345,0.098112,0.000233


##### Wnioski

Znaki zapytania występują głównie w wiadomościach neutralnych.
Często można je zauważyć także w zdaniach negatywnych: w pytaniach ironicznych, a także w ciągach takich jak „!?”.

### Wykrzykniki

In [None]:
# Raw number of "!" characters in each tweet.
df_exclemation_marks["exclamation_marks"] = df_exclemation_marks["text"].map(lambda text: str(text).count("!"))
# Same count after stripping whitespace and collapsing runs of 3+ "!" to "!!!".
df_exclemation_marks["exclamation_marks_thresholding"] = df_exclemation_marks["text"].map(
    lambda text: re.sub('!{3,}', '!!!', re.sub(r'\s+', '', str(text))).count("!")
)
# Flag tweets whose thresholded count is still above 5.
df_exclemation_marks["exclamation_marks_over_threshold"] = df_exclemation_marks["exclamation_marks_thresholding"] > 5
df_exclemation_marks

Unnamed: 0,text,sentiment,exclamation_marks,exclamation_marks_thresholding,exclamation_marks_over_threshold
0,"I`d have responded, if I were going",neutral,0,0,False
1,Sooo SAD I will miss you here in San Diego!!!,negative,3,3,False
2,my boss is bullying me...,negative,0,0,False
3,what interview! leave me alone,negative,1,1,False
4,"Sons of ****, why couldn`t they put them on t...",negative,0,0,False
...,...,...,...,...,...
27476,wish we could come see u on Denver husband l...,negative,0,0,False
27477,I`ve wondered about rake to. The client has ...,negative,0,0,False
27478,Yay good for both of you. Enjoy the break - y...,positive,0,0,False
27479,But it was worth it ****.,positive,0,0,False


##### Wiadomości z największą ilością wykrzykników

In [None]:
# Ten tweets with the most "!" characters (raw count).
df_exclemation_marks.sort_values(by="exclamation_marks", ascending=False).head(10)

Unnamed: 0,text,sentiment,exclamation_marks,exclamation_marks_thresholding,exclamation_marks_over_threshold
6259,I have MAJOR CRAMPLES!!!!!!!!!!!!!!!!!!!!!!!!!...,negative,59,3,False
16638,I LOVE IT!!!!!!!!!!!!!!!!!!!!!! I ALSO LIKE T...,positive,42,6,True
20779,God **** you Twitter!!!!!!!!!!!!!! Stop eating...,negative,40,6,True
18352,Star trek was SOOOOO AWESOME!!!!!!!!!!!!!!!!!!...,positive,38,6,True
5052,thanx Tom love u too !!!!!!!!!!!!!!!!!!!!!!!...,positive,23,3,False
27197,Happy mother`s day nfty!!!!!!!!!!!!!!!!!!!!!!,positive,22,3,False
11168,NOOOOOOO!!!!!!!!!!!!! why in july? im gonna b...,negative,22,6,True
24137,OUCH!!!!!!!!!!!!!!!.........that really hurt!!!,negative,18,6,True
21843,Everythings Sooo Messed Up!!!!!!! Life Sucks!!...,negative,17,6,True
21025,GOOD LUCK ON FINALS EVERYONE!!!!!!!!!!!!!!!!!,positive,17,3,False


##### Wiadomości z największą ilością wykrzykników po zastosowaniu progowania

In [None]:
# Ten tweets with the most "!" characters after collapsing long runs.
df_exclemation_marks.sort_values(by="exclamation_marks_thresholding", ascending=False).head(10)

Unnamed: 0,text,sentiment,exclamation_marks,exclamation_marks_thresholding,exclamation_marks_over_threshold
9425,PIRATE VOICE:AAARRRGGGHHH!!! I 4GOT MY **** WA...,negative,15,15,True
26889,UUUUUGH!!! I HATE I MISSED INTERVIEW!!! I WAS...,negative,12,12,True
4221,I am soo happy! But frustrated at the same tim...,positive,11,11,True
1388,Ahhhh!!!!!! Almost off!!! Can`t wait!! But..I ...,neutral,14,11,True
25023,UP is out today!!! Why why why why does my hea...,negative,10,10,True
22552,omg trying to fix pic but its not working!!!! ...,negative,13,9,True
20846,wahahahaha!! i wanna naaaaa!!! well...hapit n...,positive,9,9,True
16477,I have just read up on lactose stuff and I CAN...,positive,10,9,True
9389,saw Hannah Montana Movie today!! was the best!...,positive,15,9,True
21142,GRRRR!!! It`s BACK!!!!! And what I mean is my ...,negative,12,9,True


##### Średnia wystąpień wykrzykników przed i po zastosowaniu progowania

In [None]:
# Per-sentiment mean of the exclamation-mark features. The free-text column is
# dropped first: averaging an object column raises TypeError on pandas >= 2.0
# (older versions silently discarded it with a deprecation warning).
df_exclemation_marks.drop(columns="text").groupby("sentiment").mean()

Unnamed: 0_level_0,exclamation_marks,exclamation_marks_thresholding,exclamation_marks_over_threshold
sentiment,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
negative,0.503534,0.450328,0.005655
neutral,0.427145,0.404389,0.003508
positive,0.772431,0.734677,0.007807


## Kapitalizacja

In [None]:
df_capitalize = minified_df.copy()
# Share of upper-case characters among all characters of the tweet.
df_capitalize["capitalize"] = df_capitalize["text"].map(
    lambda text: sum(map(str.isupper, str(text))) / len(str(text))
)
# Flag tweets whose upper-case share lies in the closed interval [0.1, 0.9].
df_capitalize["capitalize_over_threashold"] = df_capitalize["capitalize"].between(0.1, 0.9)
df_capitalize

Unnamed: 0,text,sentiment,capitalize,capitalize_over_threashold
0,"I`d have responded, if I were going",neutral,0.055556,False
1,Sooo SAD I will miss you here in San Diego!!!,negative,0.152174,True
2,my boss is bullying me...,negative,0.000000,False
3,what interview! leave me alone,negative,0.000000,False
4,"Sons of ****, why couldn`t they put them on t...",negative,0.013333,False
...,...,...,...,...
27476,wish we could come see u on Denver husband l...,negative,0.012987,False
27477,I`ve wondered about rake to. The client has ...,negative,0.040984,False
27478,Yay good for both of you. Enjoy the break - y...,positive,0.027027,False
27479,But it was worth it ****.,positive,0.037037,False


##### Wiadomości z największą proporcją wielkich liter

In [None]:
# Ten tweets with the highest share of upper-case characters.
df_capitalize.sort_values(by="capitalize", ascending=False).head(10)

Unnamed: 0,text,sentiment,capitalize,capitalize_over_threashold
391,BRAINFREEZE,neutral,1.0,False
11560,THE VIDEO IS FINALLY DONE WOOOOOOOOOOOOOOOOOOO...,neutral,0.956204,False
16979,HAPPY JUDDDAY,positive,0.923077,False
80,THANK YYYYYYYYYOOOOOOOOOOUUUUU!,positive,0.90625,False
25786,VEGA UNDER FIREEEEEE,negative,0.9,True
26707,WELCOMEEE BACKKKKK,neutral,0.894737,True
5177,FO SHOWWW,neutral,0.888889,True
19418,HOMEWORK BORRIING,negative,0.888889,True
15746,NEED SUMTHING FOR THIS EXCRUCIATING HEADACHE,negative,0.886364,True
831,GOODNIGHT MAGIC AND PRETTY WORLD,positive,0.875,True


##### Średnia wystąpień wielkich liter przed i po zastosowaniu progowania

In [None]:
# Per-sentiment mean of the capitalisation features. The free-text column is
# dropped first: averaging an object column raises TypeError on pandas >= 2.0
# (older versions silently discarded it with a deprecation warning).
df_capitalize.drop(columns="text").groupby('sentiment').mean()

Unnamed: 0_level_0,capitalize,capitalize_over_threashold
sentiment,Unnamed: 1_level_1,Unnamed: 2_level_1
negative,0.041347,0.073384
neutral,0.043719,0.085465
positive,0.048843,0.101144


## Powtarzające się litery

In [None]:
def _longest_char_run(text, collapse_long_runs=False):
  """Return the length of the longest run of one repeated character in `text`.

  Whitespace is normalised to single spaces and the text is lower-cased, then
  runs are measured inside each space-separated word. Returns 0 when the text
  contains nothing but whitespace.

  When `collapse_long_runs` is true, every run of three or more identical
  characters is shortened to exactly three before measuring, so the result
  is at most 3.
  """
  normalized = re.sub(r'\s+', ' ', text).strip().lower()
  if not normalized:
    return 0
  if collapse_long_runs:
    normalized = re.sub(r'(.)\1{2,}', r'\1\1\1', normalized)
  return max(
    sum(1 for _ in run)
    for word in normalized.split(' ')
    for _, run in itertools.groupby(word)
  )

df_consecutive = minified_df.copy(deep=True)
df_consecutive["consecutive"] = df_consecutive["text"].map(lambda text: _longest_char_run(str(text)))
df_consecutive["consecutive_thresholding"] = df_consecutive["text"].map(
    lambda text: _longest_char_run(str(text), collapse_long_runs=True)
)
# The flag is derived from the raw run length. The thresholded value is capped
# at 3 by construction, so comparing it with 5 could never be True.
# NOTE(review): confirm the raw run length is the intended input for this flag.
df_consecutive["consecutive_over_threshold"] = df_consecutive["consecutive"] > 5
df_consecutive

Unnamed: 0,text,sentiment,consecutive,consecutive_thresholding,consecutive_over_threshold
0,"I`d have responded, if I were going",neutral,1,1,False
1,Sooo SAD I will miss you here in San Diego!!!,negative,3,3,False
2,my boss is bullying me...,negative,3,3,False
3,what interview! leave me alone,negative,1,1,False
4,"Sons of ****, why couldn`t they put them on t...",negative,4,3,False
...,...,...,...,...,...
27476,wish we could come see u on Denver husband l...,negative,2,2,False
27477,I`ve wondered about rake to. The client has ...,negative,2,2,False
27478,Yay good for both of you. Enjoy the break - y...,positive,4,3,False
27479,But it was worth it ****.,positive,4,3,False


##### Wiadomości z największą ilością powtarzających się liter

In [None]:
# Ten tweets with the longest run of one repeated character.
df_consecutive.sort_values(by="consecutive", ascending=False).head(10)

Unnamed: 0,text,sentiment,consecutive,consecutive_thresholding,consecutive_over_threshold
11560,THE VIDEO IS FINALLY DONE WOOOOOOOOOOOOOOOOOOO...,neutral,109,3,False
6259,I have MAJOR CRAMPLES!!!!!!!!!!!!!!!!!!!!!!!!!...,negative,59,59,True
21418,??? weekend standby duty??????????????????????...,neutral,50,50,True
4888,Sitting in boring **** litterature listening t...,negative,36,4,False
18352,Star trek was SOOOOO AWESOME!!!!!!!!!!!!!!!!!!...,positive,35,35,True
12662,"Oh. The voting is over! But don`t worry, I ...",positive,35,6,True
9400,I`m currently into Emarosa. Their new album i...,negative,35,3,False
9152,": Oh. The voting is over! But don`t worry, I ...",positive,35,3,False
9683,Hi Everyone miss me much? muahhhhhhhhhhhhhhhh...,neutral,32,3,False
9622,shiiiiiiiiiiiiiiiiiiiiiiiiiiiiiit advanced dat...,negative,30,3,False


##### Wiadomości z największą ilością powtarzających się liter po zastosowaniu progowania

In [None]:
# Ten tweets with the longest repeated-character run after collapsing long runs.
df_consecutive.sort_values(by="consecutive_thresholding", ascending=False).head(10)

Unnamed: 0,text,sentiment,consecutive,consecutive_thresholding
6259,I have MAJOR CRAMPLES!!!!!!!!!!!!!!!!!!!!!!!!!...,negative,59,59
21418,??? weekend standby duty??????????????????????...,neutral,50,50
18352,Star trek was SOOOOO AWESOME!!!!!!!!!!!!!!!!!!...,positive,35,35
2012,low low low low low low low low low..............,neutral,29,29
2203,Your Highness...........................cryst...,neutral,27,27
20779,God **** you Twitter!!!!!!!!!!!!!! Stop eating...,negative,26,26
5052,thanx Tom love u too !!!!!!!!!!!!!!!!!!!!!!!...,positive,23,23
13646,"I left a chocolate egg sitting in my office, a...",neutral,22,22
16638,I LOVE IT!!!!!!!!!!!!!!!!!!!!!! I ALSO LIKE T...,positive,22,22
27197,Happy mother`s day nfty!!!!!!!!!!!!!!!!!!!!!!,positive,22,22


##### Odchylenie standardowe wiadomości z powtarzającymi się literami

In [None]:
# Per-sentiment standard deviation of the repeated-character features. The
# free-text column is dropped first because an object column cannot be
# aggregated numerically (pandas >= 2.0 raises instead of skipping it).
df_consecutive.drop(columns="text").groupby('sentiment').std()

Unnamed: 0_level_0,consecutive,consecutive_thresholding,consecutive_over_threshold
sentiment,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
negative,1.74419,0.646633,0.0
neutral,1.761566,0.66778,0.0
positive,1.42565,0.620251,0.0


##### Średnia wystąpień powtarzających się liter przed i po zastosowaniu progowania

In [None]:
# Per-sentiment mean of the repeated-character features. The free-text column
# is dropped first: averaging an object column raises TypeError on
# pandas >= 2.0 (older versions silently discarded it with a warning).
df_consecutive.drop(columns="text").groupby('sentiment').mean()

Unnamed: 0_level_0,consecutive,consecutive_thresholding,consecutive_over_threshold
sentiment,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
negative,2.510988,2.184167,0.0
neutral,2.243029,2.03229,0.0
positive,2.33815,2.121067,0.0


## Pokemon Case

In [None]:
# Working copy for the alternating-case ("Pokemon case") feature.
df_pokemon = minified_df.copy()

def case_swap_percent(sentence):
  """Return the share of adjacent character pairs whose `islower()` state differs.

  All whitespace is removed before comparing neighbours. Characters for which
  `islower()` is False (upper-case letters, digits, punctuation) all fall in
  the same class. Strings with fewer than two remaining characters give 0.
  """
  compact = re.sub(r'\s+', '', sentence)
  if len(compact) < 2:
    return 0
  swaps = sum(
    left.islower() != right.islower()
    for left, right in zip(compact, compact[1:])
  )
  return swaps / (len(compact) - 1)

# Case-swap ratio of every tweet.
df_pokemon["pokemon"] = df_pokemon["text"].map(lambda text: case_swap_percent(str(text)))
# Flag tweets with a ratio above 0.3 that are also longer than 10 characters.
df_pokemon["pokemon_length"] = (df_pokemon["pokemon"] > 0.3) & (
    df_pokemon["text"].map(lambda text: len(str(text))) > 10
)
df_pokemon

Unnamed: 0,text,sentiment,pokemon,pokemon_length
0,"I`d have responded, if I were going",neutral,0.178571,False
1,Sooo SAD I will miss you here in San Diego!!!,negative,0.228571,False
2,my boss is bullying me...,negative,0.050000,False
3,what interview! leave me alone,negative,0.080000,False
4,"Sons of ****, why couldn`t they put them on t...",negative,0.083333,False
...,...,...,...,...
27476,wish we could come see u on Denver husband l...,negative,0.067797,False
27477,I`ve wondered about rake to. The client has ...,negative,0.135417,False
27478,Yay good for both of you. Enjoy the break - y...,positive,0.080460,False
27479,But it was worth it ****.,positive,0.105263,False


##### Wiadomości z największą częstotliwością występowania Pokemon Case

In [None]:
# Ten tweets with the highest case-swap ratio.
df_pokemon.sort_values(by="pokemon", ascending=False).head(10)

Unnamed: 0,text,sentiment,pokemon,pokemon_length
14311,CoNvErSaTiNg,neutral,1.0,True
2170,wEe ArR SoWbUr i PrOmIsS. tHe StYoOpId FlYiNg...,neutral,0.927083,True
5085,"cALL mE wiErD, bUt I jUs LuV rAiNy DaYs! MaKeS...",positive,0.854839,True
1629,Hi Rob!,neutral,0.8,False
18527,Aw Yay,positive,0.75,False
20930,woOt!,neutral,0.75,False
9173,i HaVe 2 Get OuT This HoUse,neutral,0.7,True
26370,Bye.,neutral,0.666667,False
6031,What To Say?,neutral,0.666667,True
17647,No! Why?,neutral,0.666667,False


##### Średnia wystąpień Pokemon Case przed i po uwzględnieniu minimalnej długości tekstu

In [None]:
# Per-sentiment mean of the alternating-case features. The free-text column is
# dropped first: averaging an object column raises TypeError on pandas >= 2.0
# (older versions silently discarded it with a deprecation warning).
df_pokemon.drop(columns="text").groupby('sentiment').mean()

Unnamed: 0_level_0,pokemon,pokemon_length
sentiment,Unnamed: 1_level_1,Unnamed: 2_level_1
negative,0.109329,0.011567
neutral,0.120407,0.025184
positive,0.124654,0.029014


## Hashtag'i

In [None]:
# Working copy for the hashtag feature.
df_hash = minified_df.copy()

def count_hash(sentence):
  """Return the number of hashtags (`#` followed by word characters) per word.

  Words are obtained by stripping the sentence, collapsing whitespace runs to
  single spaces and splitting on spaces, so an empty sentence still counts as
  one (empty) word and yields 0.0.
  """
  hashtags = re.findall(r'#\w+', sentence)
  words = re.sub(r'\s+', ' ', sentence.strip()).split(' ')
  return len(hashtags) / len(words)

# Hashtag-to-word ratio of every tweet.
df_hash["hashtags"] = df_hash["text"].map(lambda text: count_hash(str(text)))
df_hash

##### Wiadomości z największą proporcją hashtag'ów do ilości słów

In [None]:
# Ten tweets with the highest hashtag-to-word ratio.
df_hash.sort_values(by="hashtags", ascending=False).head(10)

Unnamed: 0,text,sentiment,hashtags
17124,Welcome #Follow #Freude,neutral,0.666667
13770,schade #tv_addict,neutral,0.5
21341,Thanks #sigjeans,positive,0.5
18517,LOL #yourock,positive,0.5
698,#stackeoverflow http://bit.ly/13Nfk1,neutral,0.5
4016,rlly dead? Descanse em paz... #luto #sad #RIP,negative,0.375
15203,I know. #hhrs,neutral,0.333333
16093,morning!!!! happy #juddday,positive,0.333333
24896,missed the #jonaswebcast,neutral,0.333333
6027,"happy #juddday, carrie",positive,0.333333


##### Średnia wystąpień hashtag'ów

In [None]:
# Keep only tweets that contain at least one hashtag.
# NOTE(review): this rebinds `df_hash`, so cells above show different rows if
# re-run afterwards.
df_hash = df_hash[df_hash["hashtags"] > 0]
# Per-sentiment mean ratio. The free-text column is dropped first: averaging an
# object column raises TypeError on pandas >= 2.0.
df_hash.drop(columns="text").groupby('sentiment').mean()

Unnamed: 0_level_0,hashtags
sentiment,Unnamed: 1_level_1
negative,0.094991
neutral,0.104854
positive,0.107842


## Litery spoza tablicy ASCII

In [None]:
# Working copy for the non-ASCII character feature.
df_non_ascii = minified_df.copy()

def count_non_ascii(sentence):
  """Return the fraction of characters in `sentence` that are not ASCII.

  The fraction is computed by encoding to ASCII with errors ignored and
  comparing the surviving length with the original length. An empty string
  has no characters at all, so it yields 0.0 instead of dividing by zero.
  """
  if not sentence:
    return 0.0
  return 1 - len(sentence.encode("ascii", "ignore")) / len(sentence)

# Non-ASCII share of every tweet.
df_non_ascii["non_ascii"] = df_non_ascii["text"].map(lambda text: count_non_ascii(str(text)))
# Only tweets that contain at least one non-ASCII character are kept.
df_non_ascii = df_non_ascii.loc[df_non_ascii["non_ascii"] > 0]
df_non_ascii

Unnamed: 0,text,sentiment,non_ascii
44,I love to! But I`m only available from 5pm. ...,positive,0.029412
192,*phew* Will make a note in case anyone else ...,neutral,0.041096
432,"I love mine, too . happy motherï¿½s day to yo...",positive,0.032258
645,meeting just in time that iï¿½m trying to win ...,neutral,0.041667
854,Just got confirmed that itï¿½s pizza-time with...,positive,0.030303
...,...,...,...
26472,..uuuups today is mother day???....sh***....i...,neutral,0.048000
26745,i canï¿½t choose one i love all the songs on...,positive,0.030928
26882,XDXDXD you crazy little thing why didnï¿½t ...,neutral,0.036585
27087,Shared Kim Hï¿½ltermand - Portfolio: Shared by...,positive,0.037313


##### Wiadomości z największą ilością liter spoza tablicy ASCII

In [None]:
# Ten tweets with the highest share of non-ASCII characters.
df_non_ascii.sort_values(by="non_ascii", ascending=False).head(10)

Unnamed: 0,text,sentiment,non_ascii
13965,hï¿½rlich!,neutral,0.272727
2182,Next song to be axed by me Ombra mai fï¿½ -ï¿½...,neutral,0.184615
15552,"yï¿½n tï¿½m, sang n?m s? th?y **** m?c Tr?n t...",neutral,0.177778
20092,Dï¿½a sin sol... grrrrrrr.... ï¿½why? ï¿½WHYYY...,neutral,0.166667
7973,Iï¿½m sorry for that,negative,0.142857
4655,"???ï¿½ï¿½ ???CE d grade?mock?????, ???????",neutral,0.139535
12418,Dï¿½jï¿½ vu!!! Changing out of my GQ outfit,neutral,0.139535
16725,"Buffett Lambastes Bankers, Insurers for ï¿½Gre...",neutral,0.139535
5310,whatï¿½s hot and new?,neutral,0.136364
7855,?ï¿½ nh?n ???c bonus r?i. Ch?c ch? ?? bï¿½ vï¿...,neutral,0.134328


##### Odchylenie standardowe proporcji znaków spoza tablicy ASCII

In [None]:
# Per-sentiment standard deviation of the non-ASCII share. The free-text column
# is dropped first because an object column cannot be aggregated numerically
# (pandas >= 2.0 raises instead of skipping it).
df_non_ascii.drop(columns="text").groupby('sentiment').std()

Unnamed: 0_level_0,non_ascii
sentiment,Unnamed: 1_level_1
negative,0.02853
neutral,0.046693
positive,0.023035


## Długość tekstu

In [None]:
df_length = minified_df.copy(deep=True)

# Character length of every tweet (missing text is measured as its str() form).
df_length["length"] = df_length["text"].map(lambda text: len(str(text)))
# Three mutually exclusive buckets that together cover every length:
#   short: <= 10, normal: 11..30, long: > 30.
# The upper bound of "normal" is inclusive; with a strict `< 30` a tweet of
# exactly 30 characters belonged to no bucket at all.
df_length["short"] = df_length["length"] <= 10
df_length["normal"] = (df_length["length"] > 10) & (df_length["length"] <= 30)
df_length["long"] = df_length["length"] > 30

df_length

Unnamed: 0,text,sentiment,length,short,normal,long
0,"I`d have responded, if I were going",neutral,36,False,False,True
1,Sooo SAD I will miss you here in San Diego!!!,negative,46,False,False,True
2,my boss is bullying me...,negative,25,False,True,False
3,what interview! leave me alone,negative,31,False,False,True
4,"Sons of ****, why couldn`t they put them on t...",negative,75,False,False,True
...,...,...,...,...,...,...
27476,wish we could come see u on Denver husband l...,negative,77,False,False,True
27477,I`ve wondered about rake to. The client has ...,negative,122,False,False,True
27478,Yay good for both of you. Enjoy the break - y...,positive,111,False,False,True
27479,But it was worth it ****.,positive,27,False,True,False


##### Wiadomości z najmniejszą długością tekstu

In [None]:
# Ascending sort: the ten shortest tweets come first.
df_length.sort_values(by="length").head(10)

Unnamed: 0,text,sentiment,length,short,normal,long
11524,aw,neutral,3,True,False,False
314,,neutral,3,True,False,False
641,no,neutral,3,True,False,False
14339,ow,neutral,3,True,False,False
26005,?,neutral,3,True,False,False
25699,ME,neutral,3,True,False,False
19964,boo,neutral,4,True,False,False
238,Thx,neutral,4,True,False,False
10676,yes,neutral,4,True,False,False
11456,yup,neutral,4,True,False,False


##### Dystrybucja sentymentu na podstawie długości tekstu

In [None]:
# Per-sentiment mean length and share of each length bucket. The free-text
# column is dropped first: averaging an object column raises TypeError on
# pandas >= 2.0 (older versions silently discarded it with a warning).
df_length.drop(columns="text").groupby('sentiment').mean()

Unnamed: 0_level_0,length,short,normal,long
sentiment,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
negative,70.488112,0.006297,0.12402,0.860429
neutral,65.201205,0.020327,0.176111,0.793398
positive,70.419133,0.006991,0.123281,0.861687


# Czysty BERT

In [None]:
!pip install transformers
!pip install datasets

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.20.1-py3-none-any.whl (4.4 MB)
[K     |████████████████████████████████| 4.4 MB 28.8 MB/s 
Collecting pyyaml>=5.1
  Downloading PyYAML-6.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (596 kB)
[K     |████████████████████████████████| 596 kB 66.2 MB/s 
[?25hCollecting tokenizers!=0.11.3,<0.13,>=0.11.1
  Downloading tokenizers-0.12.1-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (6.6 MB)
[K     |████████████████████████████████| 6.6 MB 59.6 MB/s 
[?25hCollecting huggingface-hub<1.0,>=0.1.0
  Downloading huggingface_hub-0.8.1-py3-none-any.whl (101 kB)
[K     |████████████████████████████████| 101 kB 11.3 MB/s 
Installing collected packages: pyyaml, tokenizers, huggingface-hub, transformers
  Attempting uninstall: pyyaml
    Found existing installation: PyYAML 3.13
    U

In [None]:
from transformers import BertTokenizer, TFBertForSequenceClassification
from transformers import InputExample, InputFeatures
from transformers import AutoTokenizer, AutoConfig
from transformers import AutoModelForSequenceClassification
from scipy.special import softmax

In [None]:
# Hub identifier of the pretrained sentiment checkpoint used without fine-tuning.
MODEL = "cardiffnlp/twitter-roberta-base-sentiment-latest"

# Tokenizer, configuration (provides id2label) and classification model.
tokenizer = AutoTokenizer.from_pretrained(MODEL)
config = AutoConfig.from_pretrained(MODEL)
model = AutoModelForSequenceClassification.from_pretrained(MODEL)

Downloading:   0%|          | 0.00/929 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/878k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/446k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/239 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/478M [00:00<?, ?B/s]

Some weights of the model checkpoint at cardiffnlp/twitter-roberta-base-sentiment-latest were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [None]:
def preprocess(text):
    """Replace user mentions and links in `text` with fixed placeholder tokens.

    The text is split on single spaces. A token that starts with '@' and is
    longer than one character becomes '@user'; a token that starts with
    'http' becomes 'http'. All other tokens, including empty ones produced by
    repeated spaces, are kept unchanged and re-joined with single spaces.
    """
    def mask(token):
        if token.startswith('@') and len(token) > 1:
            return '@user'
        if token.startswith('http'):
            return 'http'
        return token

    return " ".join(mask(token) for token in text.split(" "))

def predict(text):
    """Return the index of the highest-scoring class for a single text.

    The text is run through `preprocess`, tokenized and classified with the
    module-level `tokenizer` and `model`. The returned value is the position
    of the largest logit (a numpy integer).
    """
    # Function-scope import: torch is not imported at the top of the notebook.
    import torch

    text = preprocess(text)
    # Truncate over-long inputs, as the fine-tuning tokenization call does.
    encoded_input = tokenizer(text, return_tensors='pt', truncation=True)
    # Inference only: skip autograd bookkeeping for every call.
    with torch.no_grad():
        output = model(**encoded_input)
    logits = output[0][0].numpy()
    # softmax is monotonic, so the arg-max of the logits is the arg-max of the
    # class probabilities.
    return np.argmax(logits)


In [None]:
from sklearn.model_selection import train_test_split

bert_df = minified_df.copy(deep=True)

# Mask mentions/links, then classify every tweet with the pretrained model.
bert_df['text'] = bert_df['text'].apply(lambda x: preprocess(str(x)))
bert_df['predict'] = bert_df['text'].apply(predict)

# Fixed random_state so the split is identical after "Restart & Run All".
train, test = train_test_split(bert_df, test_size=0.2, random_state=42)

In [None]:
def _encode_sentiment(value):
    """Map a sentiment label to its class id: positive -> 2, neutral -> 1, anything else -> 0.

    Values that are already integers are returned unchanged, so re-running
    this cell does not collapse every label to 0.
    """
    if isinstance(value, (int, np.integer)):
        return value
    return 2 if value == 'positive' else 1 if value == 'neutral' else 0

bert_df['sentiment'] = bert_df['sentiment'].apply(_encode_sentiment)

In [None]:
# Preview: integer-encoded true labels next to the model predictions.
bert_df

Unnamed: 0,text,sentiment,predict
0,"I`d have responded, if I were going",1,1
1,Sooo SAD I will miss you here in San Diego!!!,0,0
2,my boss is bullying me...,0,0
3,what interview! leave me alone,0,0
4,"Sons of ****, why couldn`t they put them on t...",0,0
...,...,...,...
27476,wish we could come see u on Denver husband l...,0,1
27477,I`ve wondered about rake to. The client has ...,0,1
27478,Yay good for both of you. Enjoy the break - y...,2,2
27479,But it was worth it ****.,2,2


In [None]:
import sklearn.metrics as skm

# Ground-truth class ids and model predictions for the whole data set.
y_true, y_pred = bert_df['sentiment'], bert_df['predict']

cm = skm.confusion_matrix(y_true, y_pred)
print(cm)

print(skm.classification_report(y_true, y_pred))

[[6254  986  541]
 [2481 5625 3012]
 [ 344  685 7553]]
              precision    recall  f1-score   support

           0       0.69      0.80      0.74      7781
           1       0.77      0.51      0.61     11118
           2       0.68      0.88      0.77      8582

    accuracy                           0.71     27481
   macro avg       0.71      0.73      0.71     27481
weighted avg       0.72      0.71      0.70     27481



In [None]:
# Score one hand-written sentence to inspect the per-class probabilities.
text = preprocess("Lol man, wtf?!")
encoded_input = tokenizer(text, return_tensors='pt')
output = model(**encoded_input)
scores = softmax(output[0][0].detach().numpy())

In [None]:
# Class ids ordered from the most to the least probable.
ranking = np.argsort(scores)[::-1]
for position, class_id in enumerate(ranking, start=1):
    label = config.id2label[class_id]
    score = scores[class_id]
    print(f"{position}) {label} {np.round(float(score), 4)}")

1) Negative 0.7158
2) Neutral 0.2284
3) Positive 0.0558


# BERT na naszych danych

In [None]:
from transformers import BertTokenizer, TFBertForSequenceClassification
from transformers import InputExample, InputFeatures
from transformers import AutoTokenizer, AutoConfig
from transformers import AutoModelForSequenceClassification
from scipy.special import softmax

In [None]:
# Hub identifier of the checkpoint that is fine-tuned below.
MODEL = "cardiffnlp/twitter-roberta-base-sentiment-latest"
# Local snapshot directory, deliberately different from the hub identifier.
# Saving into a relative path equal to MODEL creates a local folder that
# from_pretrained(MODEL) would resolve instead of the hub repository on the
# next run, and that folder would contain no tokenizer files.
LOCAL_MODEL_DIR = "./pretrained/twitter-roberta-base-sentiment-latest"

tokenizer = AutoTokenizer.from_pretrained(MODEL, add_prefix_space=True, use_fast=False)

config = AutoConfig.from_pretrained(MODEL)
model = AutoModelForSequenceClassification.from_pretrained(MODEL, num_labels=3)
model.save_pretrained(LOCAL_MODEL_DIR)

Downloading:   0%|          | 0.00/929 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/878k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/446k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/239 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/478M [00:00<?, ?B/s]

Some weights of the model checkpoint at cardiffnlp/twitter-roberta-base-sentiment-latest were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [None]:
# Start again from the untouched data; this rebinds `bert_df` from the previous section.
bert_df = minified_df.copy()

In [None]:
from datasets import Dataset

# Wrap the pandas frame in a `datasets.Dataset` for the map/split calls below.
new_bert_df = Dataset.from_pandas(bert_df)

In [None]:
import torch

def preprocess_function(examples):
  """Mask mentions/links in one example's "text" field and tokenize the result.

  The text is converted with `str`, split on single spaces, tokens starting
  with '@' (and longer than one character) become '@user', tokens starting
  with 'http' become 'http', and the re-joined string is passed to the
  module-level `tokenizer` with truncation enabled.
  """
  raw_text = str(examples["text"])
  masked_tokens = (
    '@user' if token.startswith('@') and len(token) > 1
    else 'http' if token.startswith('http')
    else token
    for token in raw_text.split(" ")
  )
  return tokenizer(" ".join(masked_tokens), truncation=True)

In [None]:
# Tokenize every example.
new_bert_df = new_bert_df.map(preprocess_function)
# Encode the label: positive -> 2, neutral -> 1, anything else -> 0.
new_bert_df = new_bert_df.map(
    lambda example: {"sentiment": {"positive": 2, "neutral": 1}.get(example["sentiment"], 0)}
)

# Drop the raw text and rename the target column to "label".
new_bert_df = new_bert_df.remove_columns("text").rename_column("sentiment", "label")
new_bert_df[0]

  0%|          | 0/27481 [00:00<?, ?ex/s]

  0%|          | 0/27481 [00:00<?, ?ex/s]

{'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
 'input_ids': [0, 38, 12905, 417, 33, 2334, 6, 114, 38, 58, 164, 2],
 'label': 1}

In [None]:
from transformers import DataCollatorWithPadding

# Collator that pads each batch using the tokenizer's padding settings.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# Hold out 20% of the rows as the final test set. The seed is fixed so the
# same rows end up in the test set on every run (Restart & Run All gives
# reproducible metrics).
new_bert_df = new_bert_df.train_test_split(test_size=0.2, seed=42)

new_bert_df

DatasetDict({
    train: Dataset({
        features: ['label', 'input_ids', 'attention_mask'],
        num_rows: 21984
    })
    test: Dataset({
        features: ['label', 'input_ids', 'attention_mask'],
        num_rows: 5497
    })
})

In [None]:
# Carve a 15% validation split out of the training portion. Seeded so the
# train/validation partition is identical on every run.
new_train_bert_df = new_bert_df["train"].train_test_split(test_size=0.15, seed=42)

new_train_bert_df

DatasetDict({
    train: Dataset({
        features: ['label', 'input_ids', 'attention_mask'],
        num_rows: 18686
    })
    test: Dataset({
        features: ['label', 'input_ids', 'attention_mask'],
        num_rows: 3298
    })
})

In [None]:
from transformers import TrainingArguments, Trainer

# Fine-tuning hyper-parameters; checkpoints are written under ./results.
training_args = TrainingArguments(
    output_dir="./results",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=5,
    weight_decay=0.01,
    optim="adamw_torch"
)

# The padding collator created earlier is passed explicitly so that it is
# actually used rather than being a dead variable.
# NOTE(review): eval_dataset is supplied, but no evaluation strategy is set in
# training_args and the training log only reports training loss - confirm
# whether validation during training was intended.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=new_train_bert_df["train"],
    eval_dataset=new_train_bert_df["test"],
    tokenizer=tokenizer,
    data_collator=data_collator,
)

In [None]:
# Run the fine-tuning loop configured on `trainer`; the returned TrainOutput
# (global step, mean training loss, runtime metrics) is shown as cell output.
trainer.train()

***** Running training *****
  Num examples = 18686
  Num Epochs = 5
  Instantaneous batch size per device = 16
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 1
  Total optimization steps = 5840


Step,Training Loss
500,0.5654
1000,0.5089
1500,0.4316
2000,0.3897
2500,0.3634
3000,0.2892
3500,0.2862
4000,0.2071
4500,0.2096
5000,0.1688


Saving model checkpoint to ./results/checkpoint-500
Configuration saved in ./results/checkpoint-500/config.json
Model weights saved in ./results/checkpoint-500/pytorch_model.bin
tokenizer config file saved in ./results/checkpoint-500/tokenizer_config.json
Special tokens file saved in ./results/checkpoint-500/special_tokens_map.json
Saving model checkpoint to ./results/checkpoint-1000
Configuration saved in ./results/checkpoint-1000/config.json
Model weights saved in ./results/checkpoint-1000/pytorch_model.bin
tokenizer config file saved in ./results/checkpoint-1000/tokenizer_config.json
Special tokens file saved in ./results/checkpoint-1000/special_tokens_map.json
Saving model checkpoint to ./results/checkpoint-1500
Configuration saved in ./results/checkpoint-1500/config.json
Model weights saved in ./results/checkpoint-1500/pytorch_model.bin
tokenizer config file saved in ./results/checkpoint-1500/tokenizer_config.json
Special tokens file saved in ./results/checkpoint-1500/special_toke

TrainOutput(global_step=5840, training_loss=0.3145116636197861, metrics={'train_runtime': 961.5512, 'train_samples_per_second': 97.166, 'train_steps_per_second': 6.074, 'total_flos': 1800867454197840.0, 'train_loss': 0.3145116636197861, 'epoch': 5.0})

In [None]:
def preprocess(text):
    """Mask user mentions and links in a tweet.

    The text is split on single spaces. A piece that starts with "@" and is
    longer than one character is replaced by "@user"; a piece that starts
    with "http" is replaced by "http". Everything else is kept as is and the
    pieces are joined back with single spaces.
    """
    def _mask(piece):
        if piece.startswith('@') and len(piece) > 1:
            return '@user'
        if piece.startswith('http'):
            return 'http'
        return piece

    return " ".join(_mask(piece) for piece in text.split(" "))

def predict(encoded_input):
    """Return the predicted class id for a single tokenized example.

    Args:
        encoded_input: mapping holding "input_ids" and "attention_mask" as
            flat lists for one un-padded sequence. Other keys (for example
            "label") are ignored. The mapping is left unmodified.

    Returns:
        numpy integer: index of the highest-scoring logit. Softmax is
        monotonic, so taking the argmax of the raw logits selects the same
        class as ranking the softmax scores.
    """
    device = model.device  # build the inputs wherever the model currently lives
    batch = {
        "input_ids": torch.tensor(
            [encoded_input["input_ids"]], dtype=torch.long, device=device
        ),
        "attention_mask": torch.tensor(
            [encoded_input["attention_mask"]], dtype=torch.long, device=device
        ),
    }

    # The model may still be in training mode after fine-tuning; switch to
    # eval mode so dropout is disabled and predictions are deterministic.
    model.eval()
    # No gradients are needed for inference; skipping them saves time/memory.
    with torch.no_grad():
        logits = model(**batch)[0][0]

    return np.argmax(logits.cpu().numpy())

In [None]:
# Move the model to the CPU for the example-by-example evaluation below.
model.to('cpu')

RobertaForSequenceClassification(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(50265, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0): RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerN

In [None]:
from tqdm import tqdm

# Ground-truth labels of the held-out test split.
references = new_bert_df["test"]["label"]

# Predict every test example one at a time; tqdm reports progress.
predictions = [predict(example) for example in tqdm(new_bert_df["test"])]

100%|██████████| 5497/5497 [09:45<00:00,  9.40it/s]


In [None]:
import sklearn.metrics as skm

# Pin the class order (ids follow the sentiment encoding used for "label":
# negative=0, neutral=1, positive=2) so the matrix is always 3x3 and the
# report rows are named instead of showing bare integers.
class_ids = [0, 1, 2]
class_names = ["negative", "neutral", "positive"]

cm = skm.confusion_matrix(references, predictions, labels=class_ids)
print(cm)

print(skm.classification_report(references, predictions, labels=class_ids, target_names=class_names))

[[1259  245   47]
 [ 291 1603  295]
 [  36  226 1495]]
              precision    recall  f1-score   support

           0       0.79      0.81      0.80      1551
           1       0.77      0.73      0.75      2189
           2       0.81      0.85      0.83      1757

    accuracy                           0.79      5497
   macro avg       0.79      0.80      0.80      5497
weighted avg       0.79      0.79      0.79      5497



# BERT na dodatkowych danych

In [None]:
from transformers import BertTokenizer, TFBertForSequenceClassification
from transformers import InputExample, InputFeatures
from transformers import AutoTokenizer, AutoConfig
from transformers import AutoModelForSequenceClassification
from scipy.special import softmax
import torch

In [None]:
# Hub id of the pretrained checkpoint.
MODEL = "cardiffnlp/twitter-roberta-base-sentiment-latest"
# Local snapshot directory for the pretrained weights. It is deliberately NOT
# the same string as MODEL: saving into a directory named like the Hub id
# makes later from_pretrained(MODEL) calls resolve to that directory, which
# contains no tokenizer files, so a re-run of this cell would break.
PRETRAINED_DIR = "./pretrained/twitter-roberta-base-sentiment-latest"

tokenizer = AutoTokenizer.from_pretrained(MODEL, add_prefix_space=True, use_fast=False)

config = AutoConfig.from_pretrained(MODEL)
# Three output classes, matching the 0/1/2 sentiment encoding used for "label".
model = AutoModelForSequenceClassification.from_pretrained(MODEL, num_labels=3)
model.save_pretrained(PRETRAINED_DIR)

In [None]:
# Start again from a deep copy of minified_df for the extra-features experiment.
bert_df = minified_df.copy(deep=True)

In [None]:
from datasets import Dataset

# Convert the pandas frame into a `datasets.Dataset`
# (despite its name, new_bert_df is not a DataFrame from here on).
new_bert_df = Dataset.from_pandas(bert_df)

In [None]:
def count_non_ascii(sentence):
    """Return the fraction of characters in ``sentence`` that are not ASCII.

    Despite the name, the result is a ratio between 0.0 and 1.0, not a count:
    the string is encoded to ASCII with non-encodable characters dropped, and
    the share of dropped characters is returned.

    Args:
        sentence: text to inspect.

    Returns:
        float: 0.0 for pure-ASCII (or empty) input, 1.0 when no character is
        ASCII.
    """
    # An empty string has no characters to classify; return 0.0 instead of
    # dividing by zero.
    if not sentence:
        return 0.0
    ascii_chars = len(sentence.encode("ascii", "ignore"))
    return 1 - ascii_chars / len(sentence)

def update_dataset(dataset):
    """Add hand-crafted text features to one example and return it.

    Reads ``dataset["text"]`` (coerced with ``str``) and stores, in place:

    * "curses": number of "****" occurrences.
    * "exclamation_marks": True when more than 5 "!" remain after removing
      all whitespace and capping every run of three or more "!" at three.
    * "question_marks": the same rule applied to "?".
    * "non_ascii": result of ``count_non_ascii`` for the text.
    * "short": True when the text has at most 10 characters.
    * "long": True when the text has more than 30 characters.

    Args:
        dataset: mutable mapping for a single example (must contain "text").

    Returns:
        The same mapping that was passed in, with the keys above added.
    """
    # Coerce and strip whitespace once instead of repeating it per feature.
    text = str(dataset["text"])
    compact = re.sub(r'\s+', '', text)

    dataset["curses"] = text.count("****")
    dataset["exclamation_marks"] = re.sub(r'!{3,}', '!!!', compact).count("!") > 5
    # Raw string: '\?' in a normal literal is an invalid escape sequence.
    dataset["question_marks"] = re.sub(r'\?{3,}', '???', compact).count("?") > 5
    dataset["non_ascii"] = count_non_ascii(text)
    dataset["short"] = len(text) <= 10
    dataset["long"] = len(text) > 30
    return dataset

def preprocess_function(examples):
    """Mask mentions/links in one example's text and tokenize the result.

    The value under ``examples["text"]`` is coerced with ``str`` and split on
    single spaces. Every piece that starts with "@" and is longer than one
    character becomes "@user"; every piece that starts with "http" becomes
    "http". The pieces are re-joined with single spaces and passed to the
    module-level ``tokenizer`` with truncation enabled.

    Returns:
        Whatever ``tokenizer(...)`` returns for the normalised text.
    """
    def _mask(piece):
        if piece.startswith('@') and len(piece) > 1:
            return '@user'
        if piece.startswith('http'):
            return 'http'
        return piece

    raw_text = str(examples["text"])
    normalised = " ".join(_mask(piece) for piece in raw_text.split(" "))
    return tokenizer(normalised, truncation=True)

In [None]:
# Add the hand-crafted features, tokenize, encode the sentiment as an integer
# id (positive=2, neutral=1, anything else=0), drop the raw text and rename
# the target column to "label".
new_bert_df = (
    new_bert_df
    .map(update_dataset)
    .map(preprocess_function)
    .map(lambda x: {"sentiment": {"positive": 2, "neutral": 1}.get(x["sentiment"], 0)})
    .remove_columns("text")
    .rename_column("sentiment", "label")
)
new_bert_df[0]

  0%|          | 0/27481 [00:00<?, ?ex/s]

  0%|          | 0/27481 [00:00<?, ?ex/s]

  0%|          | 0/27481 [00:00<?, ?ex/s]

{'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
 'curses': 0,
 'exclamation_marks': False,
 'input_ids': [0, 38, 12905, 417, 33, 2334, 6, 114, 38, 58, 164, 2],
 'label': 1,
 'long': True,
 'non_ascii': 0.0,
 'question_marks': False,
 'short': False}

In [None]:
# Display the dataset's column names and row count.
new_bert_df

Dataset({
    features: ['label', 'curses', 'exclamation_marks', 'question_marks', 'non_ascii', 'short', 'long', 'input_ids', 'attention_mask'],
    num_rows: 27481
})

In [None]:
from transformers import DataCollatorWithPadding

# Collator that pads each batch using the tokenizer's padding settings.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# Hold out 20% of the rows as the final test set. The seed is fixed so the
# same rows end up in the test set on every run (Restart & Run All gives
# reproducible metrics).
new_bert_df = new_bert_df.train_test_split(test_size=0.2, seed=42)

new_bert_df

DatasetDict({
    train: Dataset({
        features: ['label', 'curses', 'exclamation_marks', 'question_marks', 'non_ascii', 'short', 'long', 'input_ids', 'attention_mask'],
        num_rows: 21984
    })
    test: Dataset({
        features: ['label', 'curses', 'exclamation_marks', 'question_marks', 'non_ascii', 'short', 'long', 'input_ids', 'attention_mask'],
        num_rows: 5497
    })
})

In [None]:
# Carve a 15% validation split out of the training portion. Seeded so the
# train/validation partition is identical on every run.
new_train_bert_df = new_bert_df["train"].train_test_split(test_size=0.15, seed=42)

new_train_bert_df

DatasetDict({
    train: Dataset({
        features: ['label', 'curses', 'exclamation_marks', 'question_marks', 'non_ascii', 'short', 'long', 'input_ids', 'attention_mask'],
        num_rows: 18686
    })
    test: Dataset({
        features: ['label', 'curses', 'exclamation_marks', 'question_marks', 'non_ascii', 'short', 'long', 'input_ids', 'attention_mask'],
        num_rows: 3298
    })
})

In [None]:
from transformers import TrainingArguments, Trainer

# Fine-tuning hyper-parameters; checkpoints are written under ./results.
training_args = TrainingArguments(
    output_dir="./results",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=5,
    weight_decay=0.01,
    optim="adamw_torch"
)

# The padding collator created earlier is passed explicitly so that it is
# actually used rather than being a dead variable.
# NOTE(review): the training log for this run reports that the columns
# curses, exclamation_marks, question_marks, non_ascii, short and long have no
# matching argument in the model's forward() and are ignored, so the
# engineered features do not reach the model in this setup. Any change in the
# metrics of this run therefore cannot be attributed to those features.
# NOTE(review): eval_dataset is supplied, but no evaluation strategy is set in
# training_args - confirm whether validation during training was intended.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=new_train_bert_df["train"],
    eval_dataset=new_train_bert_df["test"],
    tokenizer=tokenizer,
    data_collator=data_collator,
)

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).


In [None]:
# Run the fine-tuning loop configured on `trainer`; the returned TrainOutput
# (global step, mean training loss, runtime metrics) is shown as cell output.
trainer.train()

***** Running training *****
  Num examples = 18686
  Num Epochs = 5
  Instantaneous batch size per device = 16
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 1
  Total optimization steps = 5840
The following columns in the training set don't have a corresponding argument in `RobertaForSequenceClassification.forward` and have been ignored: exclamation_marks, curses, non_ascii, short, long, question_marks. If exclamation_marks, curses, non_ascii, short, long, question_marks are not expected by `RobertaForSequenceClassification.forward`,  you can safely ignore this message.


Step,Training Loss
500,0.3323
1000,0.3298
1500,0.269
2000,0.2407
2500,0.2027
3000,0.178
3500,0.1702
4000,0.1098
4500,0.1238
5000,0.0946


Saving model checkpoint to ./results/checkpoint-500
Configuration saved in ./results/checkpoint-500/config.json
Model weights saved in ./results/checkpoint-500/pytorch_model.bin
tokenizer config file saved in ./results/checkpoint-500/tokenizer_config.json
Special tokens file saved in ./results/checkpoint-500/special_tokens_map.json
Saving model checkpoint to ./results/checkpoint-1000
Configuration saved in ./results/checkpoint-1000/config.json
Model weights saved in ./results/checkpoint-1000/pytorch_model.bin
tokenizer config file saved in ./results/checkpoint-1000/tokenizer_config.json
Special tokens file saved in ./results/checkpoint-1000/special_tokens_map.json
Saving model checkpoint to ./results/checkpoint-1500
Configuration saved in ./results/checkpoint-1500/config.json
Model weights saved in ./results/checkpoint-1500/pytorch_model.bin
tokenizer config file saved in ./results/checkpoint-1500/tokenizer_config.json
Special tokens file saved in ./results/checkpoint-1500/special_toke

TrainOutput(global_step=5840, training_loss=0.18686922870270192, metrics={'train_runtime': 978.203, 'train_samples_per_second': 95.512, 'train_steps_per_second': 5.97, 'total_flos': 1797076976523696.0, 'train_loss': 0.18686922870270192, 'epoch': 5.0})

In [None]:
def preprocess(text):
    """Mask user mentions and links in a tweet.

    The text is split on single spaces. A piece that starts with "@" and is
    longer than one character is replaced by "@user"; a piece that starts
    with "http" is replaced by "http". Everything else is kept as is and the
    pieces are joined back with single spaces.
    """
    def _mask(piece):
        if piece.startswith('@') and len(piece) > 1:
            return '@user'
        if piece.startswith('http'):
            return 'http'
        return piece

    return " ".join(_mask(piece) for piece in text.split(" "))

def predict(encoded_input):
    """Return the predicted class id for a single tokenized example.

    Args:
        encoded_input: mapping holding "input_ids" and "attention_mask" as
            flat lists for one un-padded sequence. Other keys (for example
            "label" or the engineered feature columns) are ignored. The
            mapping is left unmodified.

    Returns:
        numpy integer: index of the highest-scoring logit. Softmax is
        monotonic, so taking the argmax of the raw logits selects the same
        class as ranking the softmax scores.
    """
    device = model.device  # build the inputs wherever the model currently lives
    batch = {
        "input_ids": torch.tensor(
            [encoded_input["input_ids"]], dtype=torch.long, device=device
        ),
        "attention_mask": torch.tensor(
            [encoded_input["attention_mask"]], dtype=torch.long, device=device
        ),
    }

    # The model may still be in training mode after fine-tuning; switch to
    # eval mode so dropout is disabled and predictions are deterministic.
    model.eval()
    # No gradients are needed for inference; skipping them saves time/memory.
    with torch.no_grad():
        logits = model(**batch)[0][0]

    return np.argmax(logits.cpu().numpy())

In [None]:
# Move the model to the CPU for the example-by-example evaluation below.
model.to('cpu')

RobertaForSequenceClassification(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(50265, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0): RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerN

In [None]:
from tqdm import tqdm

# Ground-truth labels of the held-out test split.
references = new_bert_df["test"]["label"]

# Predict every test example one at a time; tqdm reports progress.
predictions = [predict(example) for example in tqdm(new_bert_df["test"])]

100%|██████████| 5497/5497 [09:37<00:00,  9.51it/s]


In [None]:
import sklearn.metrics as skm

# Pin the class order (ids follow the sentiment encoding used for "label":
# negative=0, neutral=1, positive=2) so the matrix is always 3x3 and the
# report rows are named instead of showing bare integers.
class_ids = [0, 1, 2]
class_names = ["negative", "neutral", "positive"]

cm = skm.confusion_matrix(references, predictions, labels=class_ids)
print(cm)

print(skm.classification_report(references, predictions, labels=class_ids, target_names=class_names))

[[1333  178   19]
 [ 143 1995  135]
 [  17  180 1497]]
              precision    recall  f1-score   support

           0       0.89      0.87      0.88      1530
           1       0.85      0.88      0.86      2273
           2       0.91      0.88      0.90      1694

    accuracy                           0.88      5497
   macro avg       0.88      0.88      0.88      5497
weighted avg       0.88      0.88      0.88      5497

