# Collect all CSV data into one dataframe

In [None]:
import pandas as pd
#import plt
import matplotlib.pyplot as plt
import numpy as np
import glob

# Collect every CSV file in the current working directory. glob returns paths
# in arbitrary, OS-dependent order, so sort them to make the row order of the
# combined dataframe reproducible between runs.
csv_files = sorted(glob.glob('*.csv'))

# pd.concat raises an opaque "No objects to concatenate" ValueError on an
# empty list; fail early with a message that says what is actually missing.
if not csv_files:
    raise ValueError("No CSV files found in the current working directory.")

# Load each CSV file into its own dataframe, then stack them into a single
# dataframe with a fresh 0..n-1 index.
df_list = [pd.read_csv(file) for file in csv_files]
combined_df = pd.concat(df_list, ignore_index=True)

# Drop the first column of the combined dataframe.
# NOTE(review): presumably this is an index column written by the export - confirm.
combined_df = combined_df.drop(columns=combined_df.columns[0])

print(combined_df.head())

# Save the combined dataframe to a new CSV file.
# NOTE: if enabled, the output file also matches the '*.csv' pattern above and
# would be loaded again on the next run from the same directory.
#combined_df.to_csv('combined_csv.csv', index=False)

In [None]:
# Count the messages per author once and reuse the result for every lookup
# below instead of recomputing value_counts() each time.
author_message_counts = combined_df['author'].value_counts()

# Find the user who has sent the most messages
most_active_user = author_message_counts.idxmax()
most_messages_count = author_message_counts.max()

print(f"The user who has sent the most messages is: {most_active_user} with {most_messages_count} messages.")

# Find the user who has sent the least messages and report it as well, so the
# value is not computed and then silently discarded.
least_active_user = author_message_counts.idxmin()
least_messages_count = author_message_counts.min()

print(f"The user who has sent the least messages is: {least_active_user} with {least_messages_count} messages.")

# Print the message count of every user
print(author_message_counts)

In [None]:
# Tally how many rows (messages) each author has
user_message_counts = combined_df['author'].value_counts()

# Draw each author's share of the messages as a pie chart
fig, ax = plt.subplots(figsize=(10, 8))
ax.pie(
    user_message_counts,
    labels=user_message_counts.index,
    autopct='%1.1f%%',
    startangle=140,
)
ax.axis('equal')  # equal axis scaling keeps the pie circular
ax.set_title('Messages Sent by Each User')
plt.show()

In [None]:
# Parse the timestamp column as ISO 8601 datetimes
combined_df['timestamp'] = pd.to_datetime(combined_df['timestamp'], format='ISO8601')

# Keep only the calendar date of every timestamp
combined_df['date'] = combined_df['timestamp'].dt.date

# Tally the messages per date, then read off the busiest date and its count
date_counts = combined_df['date'].value_counts()
most_messages_date = date_counts.idxmax()
most_messages_date_count = date_counts.max()

print(f"The date with the most messages is: {most_messages_date} with {most_messages_date_count} messages.")

In [None]:
# Count the rows of combined_df that fall on each date
messages_per_day = combined_df.groupby('date').size()

# Draw the daily totals as a line chart on an explicitly created axes
fig, ax = plt.subplots(figsize=(12, 6))
messages_per_day.plot(kind='line', ax=ax)
ax.set_xlabel('Date')
ax.set_ylabel('Number of Messages')
ax.set_title('Number of Messages Sent Per Day')
ax.grid(True)
plt.show()

In [None]:
from collections import Counter
import re

# How many of the most frequent words to keep. Other cells slice the first 20
# and the first 50 entries of most_common_words, so keep this at 50 or above.
TOP_WORD_COUNT = 200

# Combine all non-missing messages into a single string
all_messages = ' '.join(combined_df['message'].dropna())

# Use a regular expression to find all words; the text is lower-cased first
# so that counting is case-insensitive.
words = re.findall(r'\b\w+\b', all_messages.lower())

# Count the frequency of each word
word_counts = Counter(words)

# Keep the most frequent words. The list is shorter than TOP_WORD_COUNT when
# the data contains fewer distinct words.
most_common_words = word_counts.most_common(TOP_WORD_COUNT)

# Report the number of words actually listed instead of a hard-coded figure
# that disagrees with the length of the list.
print(f"The {len(most_common_words)} most common words are:")
for rank, (word, count) in enumerate(most_common_words, start=1):
    print(f"{rank}. : {word}: {count}")


In [None]:
# Word to look up in the overall word tally
specific_word = 'arne'

# Number of occurrences of that word, 0 when it never occurred
specific_word_count = word_counts.get(specific_word, 0)

print(f"The word '{specific_word}' has been written {specific_word_count} times.")

In [None]:
# Extract the hour of day from the timestamp
combined_df['hour'] = combined_df['timestamp'].dt.hour

# Count the messages per hour once and reuse the result below
hour_counts = combined_df['hour'].value_counts()

# Busiest and quietest hours. Hours without any messages do not appear in
# value_counts(), so the "least" hour is the quietest hour that has at least
# one message.
most_messages_hour = hour_counts.idxmax()
most_messages_hour_count = hour_counts.max()
least_messages_hour = hour_counts.idxmin()
least_messages_hour_count = hour_counts.min()

print(f"The hour with the most messages is: {most_messages_hour}:00 with {most_messages_hour_count} messages.")
print(f"The hour with the least messages is: {least_messages_hour}:00 with {least_messages_hour_count} messages.")


In [None]:
# Split the 20 most common (word, count) pairs into two parallel tuples
words, counts = zip(*most_common_words[:20])

# Plot each word's share of the selected words as a pie chart
plt.figure(figsize=(10, 8))
plt.pie(counts, labels=words, autopct='%1.1f%%', startangle=140)
plt.axis('equal')
# Take the number in the title from the data so it cannot disagree with the
# size of the slice above.
plt.title(f'Top {len(words)} Most Common Words')
plt.show()

In [None]:
# Take the first 50 (word, count) pairs and separate them into a tuple of
# words and a tuple of counts
top_50_pairs = most_common_words[:50]
top_50_words, top_50_counts = zip(*top_50_pairs)

# Line chart of the counts with one marker per word
plt.figure(figsize=(14, 7))
plt.plot(top_50_words, top_50_counts, marker='o')
plt.title('Top 50 Most Common Words')
plt.xlabel('Words')
plt.ylabel('Counts')
plt.xticks(rotation=90)  # vertical labels so the words do not overlap
plt.grid(True)
plt.show()

In [None]:
# Extract the hour of day from the timestamp
combined_df['hour'] = combined_df['timestamp'].dt.hour

# Count the messages sent in each hour. Reindex to all 24 hours so hours
# without any messages show up as 0: a pandas bar plot places its bars at
# positions 0..n-1, so without this the 24 tick labels set below would be
# attached to the wrong bars whenever an hour is missing from the data.
messages_per_hour = (
    combined_df['hour']
    .value_counts()
    .reindex(range(24), fill_value=0)
)

# Plot the bar chart
plt.figure(figsize=(12, 6))
messages_per_hour.plot(kind='bar')
plt.xlabel('Hour of the Day')
plt.ylabel('Number of Messages')
plt.title('Number of Messages Sent by Hour of the Day')
plt.xticks(range(24), [f'{i}:00' for i in range(24)])
plt.grid(True)
plt.show()

In [None]:
# Authors to include in the plot. Leave the list empty to plot every author
# found in the data.
unique_authors = []

# Fall back to all (non-missing) authors when no explicit selection is given,
# so the cell does not silently draw an empty chart.
authors_to_plot = list(unique_authors) or list(combined_df['author'].dropna().unique())

# Create a figure for the plots
plt.figure(figsize=(14, 10))

# Loop through each author and plot their message counts per hour
for author in authors_to_plot:
    # Filter the dataframe for the specific author
    author_df = combined_df[combined_df['author'] == author]

    # Count the author's messages in each hour. Reindex to all 24 hours so an
    # hour without messages is plotted as 0 instead of being skipped, which
    # would make the line jump straight between the neighbouring hours.
    author_messages_per_hour = (
        author_df['hour'].value_counts().reindex(range(24), fill_value=0)
    )

    # Plot the data
    plt.plot(author_messages_per_hour.index, author_messages_per_hour.values, marker='o', label=author)

# Add labels and title
plt.xlabel('Hour of the Day')
plt.ylabel('Number of Messages')
plt.title('Number of Messages Sent by Each User by Hour of the Day')
plt.xticks(range(24), [f'{i}' for i in range(24)])
# Only draw a legend when at least one labelled line exists
if authors_to_plot:
    plt.legend()
plt.grid(True)
plt.show()

In [None]:
# Author to analyze; set this to a value that occurs in the 'author' column
specific_author = ''

# Filter the dataframe for the specific author
author_df = combined_df[combined_df['author'] == specific_author]

# Say so explicitly when the author has no messages, instead of only showing
# a flat chart.
if author_df.empty:
    print(f"No messages found for author '{specific_author}'.")

# Count the author's messages in each hour. Reindex to all 24 hours so an hour
# without messages is plotted as 0 instead of being skipped.
author_messages_per_hour = (
    author_df['hour'].value_counts().reindex(range(24), fill_value=0)
)

# Plot the data
plt.figure(figsize=(12, 6))
author_messages_per_hour.plot(kind='line', marker='o')
plt.xlabel('Hour of the Day')
plt.ylabel('Number of Messages')
plt.title(f'Number of Messages Sent by {specific_author} by Hour of the Day')
plt.xticks(range(24), [f'{i}' for i in range(24)])
plt.grid(True)
plt.show()

In [None]:
# Row count of the combined dataframe, read from the first entry of its shape
num_rows = combined_df.shape[0]
print(f"The number of rows in the combined dataframe is: {num_rows}")

In [None]:
# Report how often each of these words occurs in the overall word tally.
# After the loop, specific_word and specific_word_count hold the values for
# the last word in the tuple.
for specific_word in ('eg', 'jeg'):
    specific_word_count = word_counts[specific_word]
    print(f"The word '{specific_word}' has been written {specific_word_count} times.")

In [None]:
from collections import Counter
import re

# A "phrase" here is a run of two or more words separated only by whitespace
phrase_pattern = re.compile(r'\b\w+\b(?:\s+\b\w+\b)+')

# Search each message on its own. Joining all messages into one string first
# would let a phrase run across the boundary between two unrelated messages,
# because the join separator is itself whitespace.
phrases = []
for message in combined_df['message'].dropna():
    phrases.extend(phrase_pattern.findall(message.lower()))

# Count the frequency of each phrase
phrase_counts = Counter(phrases)

if phrase_counts:
    # Find the most common phrase
    most_common_phrase, most_common_phrase_count = phrase_counts.most_common(1)[0]
    print(f"The most common phrase is: '{most_common_phrase}' with {most_common_phrase_count} occurrences.")
else:
    # most_common(1) is an empty list when no phrase was found, so indexing
    # it would raise IndexError; report the situation instead.
    most_common_phrase, most_common_phrase_count = None, 0
    print("No phrases of two or more words were found.")

In [None]:
# Tally identical message texts, then read off the top entry and its count
message_counts = combined_df['message'].value_counts()
most_repeated_message = message_counts.idxmax()
most_repeated_message_count = message_counts.max()

print(f"The most repeated message is: '{most_repeated_message}' with {most_repeated_message_count} occurrences.")

In [None]:
# Boolean mask of rows whose message contains 'lol' anywhere, ignoring case;
# rows with a missing message count as no match
contains_lol = combined_df['message'].str.contains('lol', case=False, na=False)
lol_entries = combined_df[contains_lol]

# Number of matching messages per author
lol_author_counts = lol_entries['author'].value_counts()

print(lol_author_counts)

In [None]:
from collections import defaultdict, Counter
import re

# Word tally per author: maps each author to a Counter of lower-cased words
author_word_counts = defaultdict(Counter)

# Compile the word pattern once instead of looking it up for every message
word_pattern = re.compile(r'\b\w+\b')

# Walk the two columns in lockstep instead of using iterrows(), which builds
# a full Series object for every row and is much slower on large dataframes.
for author, message in zip(combined_df['author'], combined_df['message']):
    # Skip rows without a message
    if pd.notna(message):
        author_word_counts[author].update(word_pattern.findall(message.lower()))

# Ten most common words for each author. The loop variable is deliberately not
# named word_counts so it cannot be confused with the global word tally.
most_common_words_per_author = {
    author: counter.most_common(10)
    for author, counter in author_word_counts.items()
}

# Print the most common words for each author
for author, common_words in most_common_words_per_author.items():
    print(f"Most common words for {author}:")
    for word, count in common_words:
        print(f"{word}: {count}")
    print()

In [None]:
# Tally the rows per channel and show the result
channel_column = combined_df['channel']
channel_message_counts = channel_column.value_counts()
print(channel_message_counts)