In [2]:
!python -m spacy download pl_core_news_lg

Collecting pl-core-news-lg==3.7.0
  Downloading https://github.com/explosion/spacy-models/releases/download/pl_core_news_lg-3.7.0/pl_core_news_lg-3.7.0-py3-none-any.whl (573.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m573.7/573.7 MB[0m [31m2.4 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: pl-core-news-lg
Successfully installed pl-core-news-lg-3.7.0
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('pl_core_news_lg')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.


In [3]:
import pandas as pd
import matplotlib.pyplot as plt
import spacy
import re

# Load spaCy model
nlp = spacy.load('pl_core_news_lg')


In [11]:
def clean_data(df):
    """
    Part 1 - Processing and cleaning the data:
    - Convert weekdays to full names
    - Convert months to numbers
    - Extract links
    - Remove stopwords
    - Add a clean date column in DD.MM.YYYY format
    """
    df = df.copy()

    # Task 1.1: Replace weekday abbreviations
    weekday_map = {
        'Mon': 'Monday', 'Tue': 'Tuesday', 'Wed': 'Wednesday',
        'Thu': 'Thursday', 'Fri': 'Friday', 'Sat': 'Saturday', 'Sun': 'Sunday'
    }
    df['weekday'] = df['created_at'].str.split().str[0].map(weekday_map)

    # Task 1.2: Replace month abbreviations with numbers
    month_map = {
        'Jan': '01', 'Feb': '02', 'Mar': '03', 'Apr': '04', 'May': '05',
        'Jun': '06', 'Jul': '07', 'Aug': '08', 'Sep': '09', 'Oct': '10',
        'Nov': '11', 'Dec': '12'
    }

    def convert_date(date_str):
        if pd.isna(date_str):
            return date_str
        parts = date_str.split()
        if len(parts) >= 2:
            parts[1] = month_map.get(parts[1], parts[1])
        return ' '.join(parts)

    df['user_created_at_converted'] = df['user_created_at'].apply(convert_date)

    # Add a clean date column in DD.MM.YYYY format
    df['clean_date'] = pd.to_datetime(df['user_created_at'], format='%a %b %d %H:%M:%S %z %Y', errors='coerce')
    df['clean_date'] = df['clean_date'].dt.strftime('%d.%m.%Y')

    # Tasks 1.3, 1.4, 1.5: Extract links to lists
    tweet_links = df['tweet_url'].dropna().tolist()
    url_links = df['urls'].dropna().tolist()
    media_links = df['media'].dropna().tolist()

    # Task 1.6: Remove stopwords
    def remove_stopwords(text):
        if pd.isna(text):
            return ''
        doc = nlp(str(text))
        return ' '.join([token.text for token in doc if not token.is_stop])

    df['text_without_stopwords'] = df['text'].apply(remove_stopwords)

    # Save links to files
    with open('tweet_links.txt', 'w', encoding='utf-8') as f:
        f.write('\n'.join(tweet_links))
    with open('url_links.txt', 'w', encoding='utf-8') as f:
        f.write('\n'.join(url_links))
    with open('media_links.txt', 'w', encoding='utf-8') as f:
        f.write('\n'.join(media_links))

    return df

In [12]:

def analyze_data(df):
    """
    Part 2 - Exploratory data analysis:
    Generate all required analysis results
    """
    results = {
        # Task 2.1: Top 5 by likes
        'top_likes': df.nlargest(5, 'favorite_count')[['text', 'favorite_count']],
        # Task 2.2: Top 5 by retweets
        'top_retweets': df.nlargest(5, 'retweet_count')[['text', 'retweet_count']],
        # Task 2.3: Non-sensitive tweets
        'non_sensitive': df[df['possibly_sensitive'] == 'FALSE'],
        # Task 2.4: Earliest account tweets
        'earliest_user_tweets': df[df['user_created_at'] == df['user_created_at'].min()],
        # Task 2.5: Most followed user tweets
        'most_followed_tweets': df[df['user_followers_count'] == df['user_followers_count'].max()],
        # Task 2.6: Verified users
        'verified_users': df[df['user_verified'] == True],
        # Task 2.7: Most common day
        'most_common_day': df['created_at'].str.split().str[0].mode()[0]
    }

    # Save analysis results
    pd.DataFrame(results['top_likes']).to_csv('top_likes.csv', index=False)
    pd.DataFrame(results['top_retweets']).to_csv('top_retweets.csv', index=False)
    pd.DataFrame(results['earliest_user_tweets']).to_csv('earliest_user_tweets.csv', index=False)
    pd.DataFrame(results['most_followed_tweets']).to_csv('most_followed_tweets.csv', index=False)
    pd.DataFrame(results['verified_users']).to_csv('verified_users.csv', index=False)

    return results



In [13]:
def process_nlp(df):
    """
    Part 3 - Natural language processing:
    Extract entities and create required columns
    """
    # Load Polish model
    nlp = spacy.load('pl_core_news_lg')

    def extract_entities(text):
        doc = nlp(str(text))

        # Task 3.1: Extract persons
        persons = []
        # Task 3.2: Extract places
        places = []
        # Task 3.3: Extract organizations
        orgs = []

        # First pass - using spaCy's NER
        for ent in doc.ents:
            if ent.label_ in ['persName', 'PERSON']:
                persons.append(ent.text)
            elif ent.label_ in ['placeName', 'GPE', 'LOC']:
                places.append(ent.text)
            elif ent.label_ in ['orgName', 'ORG']:
                orgs.append(ent.text)

        # Second pass - custom rules for organizations
        words = text.split()
        for i, word in enumerate(words):
            if word[0].isupper() and i < len(words)-1:
                phrase = word
                next_word = words[i+1]
                if next_word[0].isupper() or next_word.lower() in ['m.st.', 'sp.', 'z.o.o', 'sa']:
                    phrase += ' ' + next_word
                    if any(org_word in phrase for org_word in ['Straż', 'Urząd', 'Komitet', 'Fundacja']):
                        orgs.append(phrase)

        return persons, places, orgs

    # Create new columns for entities
    df[['persons', 'places', 'organisations']] = pd.DataFrame(
        df['text'].apply(extract_entities).tolist(),
        index=df.index
    )

    return df

In [14]:
def create_weekday_plot(df):
    """
    Part 4 - Visualization:
    Create matplotlib graph of tweets per weekday
    """
    plt.figure(figsize=(10, 6))
    weekday_counts = df['created_at'].str.split().str[0].value_counts()
    weekday_order = ['Mon', 'Tue', 'Wed', 'Thu', 'Fri', 'Sat', 'Sun']
    weekday_counts = weekday_counts.reindex(weekday_order)

    plt.bar(weekday_counts.index, weekday_counts.values)
    plt.title('Number of Tweets per Day of the Week')
    plt.xlabel('Day of the Week')
    plt.ylabel('Number of Tweets')
    plt.xticks(rotation=45)
    plt.tight_layout()

    # Save plot
    plt.savefig('tweets_per_day.png')
    plt.close()



In [15]:
# Main execution
print("Starting data analysis...")

# Read data
df = pd.read_csv('dane1.csv')

# Part 1: Clean data
print("\nPart 1: Cleaning data...")
cleaned_df = clean_data(df)

# Part 2: Analysis
print("\nPart 2: Performing analysis...")
analysis_results = analyze_data(cleaned_df)

# Part 3: NLP
print("\nPart 3: Processing text with NLP...")
processed_df = process_nlp(cleaned_df)

# Save final processed data with all new columns
processed_df.to_csv('processed_tweets.csv', index=False)

# Part 4: Visualization
print("\nPart 4: Creating visualization...")
create_weekday_plot(processed_df)

# Display results
print("\nResults preview:")
print("\nSample of extracted entities:")
print(processed_df[['text', 'persons', 'places', 'organisations']].head())

print("\nTop 5 liked tweets:")
print(analysis_results['top_likes'])

print("\nTop 5 retweeted tweets:")
print(analysis_results['top_retweets'])

print("\nMost common day for tweets:", analysis_results['most_common_day'])

print("\nAnalysis complete. Check the following output files:")
print("1. processed_tweets.csv - Complete processed dataset")
print("2. tweet_links.txt - List of tweet URLs")
print("3. url_links.txt - List of URLs from tweets")
print("4. media_links.txt - List of media links")
print("5. top_likes.csv - Top liked tweets")
print("6. top_retweets.csv - Top retweeted tweets")
print("7. earliest_user_tweets.csv - Tweets from earliest user")
print("8. most_followed_tweets.csv - Tweets from most followed user")
print("9. verified_users.csv - Tweets from verified users")
print("10. tweets_per_day.png - Visualization of tweet frequency")

Starting data analysis...

Part 1: Cleaning data...

Part 2: Performing analysis...

Part 3: Processing text with NLP...

Part 4: Creating visualization...

Results preview:

Sample of extracted entities:
                                                text                  persons  \
0  @beata_skwarska Warszawa 😀 https://t.co/W7BcyS...                       []   
1  Nieznani sprawcy podpalili kapliczkę nadrzewną...                       []   
2  ⚠️ Utrudnienia w komunikacji: L20 https://t.co...                       []   
3  @LukaszKohut @moanrosa @LincaAgata @jan_jozef_...                       []   
4  Dzieci to największy skarb, o który musimy dba...  [#DzieńDziecka, Franio]   

                            places organisations  
0      [@beata_skwarska, Warszawa]            []  
1         [warszawskim, Grochowie]            []  
2                               []            []  
3  [Warszawa, polskiego, Warschau]            []  
4                       [Warszawa]           [👧]  

T