In [1]:
import pandas as pd
import re
import ast

In [2]:
df4 = pd.read_pickle("/home/student/HallucinationsLLM/data/team4_df.pkl")
df5 = pd.read_excel("/home/student/HallucinationsLLM/data/team5_clean_dataset.xlsx", index_col=0)

In [3]:
image_overlap = list(set(df4.index).intersection(df5['image_link']))
image_overlap

['https://cdn.pixabay.com/photo/2016/11/18/13/23/action-1834465_1280.jpg',
 'https://cdn.pixabay.com/photo/2016/11/29/05/26/beach-1867524_1280.jpg',
 'https://cdn.pixabay.com/photo/2019/11/29/08/34/space-4660847_1280.jpg']

In [4]:
def get_max_key_value(d):
    if d:
        max_key = max(d, key=d.get)
        return max_key, d[max_key]
    else:
        return None, None

def process_df4(df):
    df['hallucinations'].fillna(df['hillucination_text'], inplace=True)
    df = df.drop('hillucination_text', axis=1)
    df['image_link'] = df.index
    df = df.reset_index(drop=True)
    for i in range(1,5):
        df[[f'pred_{i}', f'pred_{i}_prob']] = df[f'pred_{i}'].apply(lambda x: pd.Series(get_max_key_value(x)))
    
    df.rename(columns={'text': 'description', 'generated_logits': 'logits'}, inplace=True)
    df['temperature'] = 0.7
    return df

In [5]:
df4 = process_df4(df4)
df = pd.concat([df4, df5]).reset_index(drop=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['hallucinations'].fillna(df['hillucination_text'], inplace=True)


In [6]:
df4.to_excel('team4_df.xlsx')

In [12]:
df = df4

In [13]:
def validate_brackets(string):
    counter = 0
    for c in string: 
        if c == "[":
            counter += 1 
        elif c == "]":
            counter -= 1
        if counter < 0 or counter > 1:
            return False 
    return True

def validate_spaces(s):
    if "[" not in s:
        return True
    matches = re.finditer(r'\[.*?\]', s)
    for match in matches:
        start, end = match.span()
        # Check character before the match
        if start > 0 and s[start - 1].isalpha():
            return False
        # Check character after the match
        if end < len(s) and s[end].isalpha():
            return False
    return True

def check(logits):
    tokens = []
    for token, _ in logits: 
        if "<" in token or ">" in token:
            tokens.append(token)
    return list(set(tokens))


def clean_text(text):
    # Remove specific substrings
    text = re.sub(r'<0x0A>', ' ', text)
    text = re.sub(r'</s>', '', text)
    # Remove any other unwanted patterns (adjust the regex if needed)
    text = re.sub(r'<[^>]*>', '', text)
    return text.strip()

def count_words(text):
    words = text.split()
    return len(words)

def extract_brackets_len(text):
    lens = []
    cleaned_text = clean_text(text)
    words = cleaned_text.split()
    for i, word in enumerate(words):
        if "[" in word:
            counter = 0
            for j in range(i, len(words)):
                counter += 1
                if "]" in words[j]:
                    break
            lens.append(counter)
    return lens

def dot_in_hal(text):
    lens = []
    cleaned_text = clean_text(text)
    words = cleaned_text.split()
    for i, word in enumerate(words):
        if "[" in word:
            counter = 0
            for j in range(i, len(words)):
                counter += 1
                if "." in words[j]:
                    print(words[j])
                if "]" in words[j]:
                    break
            lens.append(counter)
    return lens


def count_brackets(string):
    counter = 0
    for c in string: 
        if c == "[":
            counter += 1 
    return counter


def columns_compatibility(description, hallucinations, hedges, context1, context2, context3, context4):
    text_set = set()
    text_set.add(description.replace(" ", ""))
    text_set.add(hallucinations.replace("[", "").replace("]", "").replace(" ", ""))
    # text_set.add(hedges.replace("[", "").replace("]", "").replace(" ", ""))
    # text_set.add(context1.replace("[", "").replace("]", "").replace(" ", ""))
    # text_set.add(context2.replace("[", "").replace("]", "").replace(" ", ""))
    # text_set.add(context3.replace("[", "").replace("]", "").replace(" ", ""))
    # text_set.add(context4.replace("[", "").replace("]", "").replace(" ", ""))
    return len(text_set)

In [14]:
rows_compatibilty = df.apply(lambda row: columns_compatibility(row['description'], row['hallucinations'], row['hedges'], row['context_1'], row['context_2'], row['context_3'], row['context_4']), axis=1)
rows_compatibilty[rows_compatibilty != 1]

50    2
88    2
dtype: int64

In [15]:
a = df['hallucinations'].apply(count_brackets)
a[a == 0].index

Index([114, 115], dtype='int64')

In [16]:
cols_with_brackets = ['hallucinations', 'hedges', 'context_1', 'context_2', 'context_3', 'context_4']
for col in cols_with_brackets:
    validation_result = df[col].apply(lambda x: validate_brackets(x))
    passed_num = validation_result.astype(int).sum() 
    valid_spaces = df[col].apply(validate_spaces)
    valid_space_num = valid_spaces.astype(int).sum() 
    if 'context' in col:
        brackets_count = df[col].apply(count_brackets)
        zero_brackets = brackets_count[brackets_count == 0]
        if len(zero_brackets) > 0:
            print(f"{col} has zero brackets: {zero_brackets.index.tolist()}")
    if passed_num != len(df):
        print(f"{col} validation test failed: {passed_num}")
        print("indexes", validation_result[validation_result == False].index.tolist())
    if valid_space_num != len(df):
        print(f"{col} space test failed: {valid_space_num}")
        print("indexes", valid_spaces[valid_spaces == False].index.tolist())

hedges validation test failed: 199
indexes [156]
hedges space test failed: 199
indexes [184]
context_1 space test failed: 198
indexes [15, 27]
context_2 space test failed: 196
indexes [5, 8, 38, 47]
context_3 validation test failed: 198
indexes [177, 193]
context_4 space test failed: 197
indexes [5, 8, 9]


In [23]:
df['logits'] = df['logits'].apply(lambda x: ast.literal_eval(x) if isinstance(x, str) else x)
lst = df['logits'].apply(check).tolist()
flattened_list = [item for sublist in lst for item in sublist]
set(flattened_list)

{'</s>', '<0x0A>'}

In [34]:
df['logits'].apply(lambda x: len(x)).value_counts()

logits
100    136
150     50
200     14
Name: count, dtype: int64

In [31]:
for i in range(len(df['description'])):
    print(i, len(df['description'].iloc[i].split(" ")))

0 100
1 100
2 100
3 100
4 100
5 100
6 100
7 100
8 100
9 100
10 100
11 100
12 100
13 100
14 70
15 65
16 75
17 77
18 79
19 71
20 73
21 76
22 78
23 78
24 77
25 82
26 81
27 81
28 81
29 83
30 72
31 82
32 79
33 68
34 80
35 67
36 81
37 77
38 85
39 80
40 84
41 71
42 79
43 82
44 77
45 79
46 70
47 79
48 83
49 61
50 82
51 78
52 78
53 78
54 78
55 76
56 75
57 70
58 77
59 79
60 68
61 76
62 86
63 75
64 75
65 69
66 80
67 80
68 77
69 78
70 76
71 79
72 75
73 81
74 82
75 81
76 69
77 76
78 79
79 58
80 72
81 71
82 79
83 77
84 76
85 74
86 78
87 78
88 69
89 78
90 78
91 76
92 80
93 76
94 74
95 74
96 84
97 79
98 69
99 71
100 108
101 121
102 113
103 115
104 118
105 118
106 105
107 112
108 125
109 101
110 110
111 114
112 118
113 108
114 116
115 114
116 114
117 106
118 115
119 119
120 100
121 107
122 101
123 107
124 123
125 112
126 108
127 114
128 113
129 114
130 115
131 113
132 117
133 117
134 110
135 111
136 108
137 119
138 110
139 110
140 111
141 112
142 121
143 114
144 113
145 108
146 93
147 114
148 109
149 1

In [17]:
_ = df['hallucinations'].apply(dot_in_hal)

laptop].
vase].
[backrest].
racket].
camera].
camera].
hair].
right].
floor].
window].
surface].
it].
Sahara].
table].
ant].
performance].
organ].
[upwards].
outwards].
surface].
shoulders].
background].
alley].
side].
other].
sign].
angle].
desk].
water].
anticipation].
ball].
desk].
image].
[swimming].
Spades].
prank].
half].
shoulders].
face].
[genders].
[tees].
[2000].
upwards].
counter].
[grass].
[side].
[jam].
[sidewalk].
[camera].
shoulder].
[behind].
[camera].
bicycles].
[rope].
[boat].
[bicycles].
[bicycle].
[water].
[white].
[hands].
[right].
[extended].
[beard].
[holding].
elevated].
left].
[him].
[banana].
(CHF)].
[circle].
white].
[stone].
individuals].
hand].
clarinets].
[piano].
celebration].
three].


In [18]:
df['hal_lens'] = df['hallucinations'].apply(extract_brackets_len)
hal_lens = []
for len_list in df['hal_lens'].values:
    hal_lens.extend(len_list)

print(pd.Series(hal_lens).value_counts())

def is_max_greater_than_2(lst):
    if len(lst) == 0:
        return False
    return max(lst) >= 2

df[df['hal_lens'].apply(is_max_greater_than_2)].index

1     326
2      54
3      30
4      22
5      14
6      11
8       6
9       5
7       3
13      2
11      1
15      1
12      1
18      1
Name: count, dtype: int64


Index([  0,   1,   2,   3,   4,   5,   6,   7,   8,  10,
       ...
       177, 181, 182, 186, 187, 193, 194, 196, 197, 198],
      dtype='int64', length=102)