diff --git a/.idea/.gitignore b/.idea/.gitignore new file mode 100644 index 0000000000..73f69e0958 --- /dev/null +++ b/.idea/.gitignore @@ -0,0 +1,8 @@ +# Default ignored files +/shelf/ +/workspace.xml +# Datasource local storage ignored files +/dataSources/ +/dataSources.local.xml +# Editor-based HTTP Client requests +/httpRequests/ diff --git a/.idea/backend.iml b/.idea/backend.iml new file mode 100644 index 0000000000..d4c435b8d7 --- /dev/null +++ b/.idea/backend.iml @@ -0,0 +1,11 @@ + + + + + + + + + + \ No newline at end of file diff --git a/.idea/externalDependencies.xml b/.idea/externalDependencies.xml new file mode 100644 index 0000000000..7872ffbcf2 --- /dev/null +++ b/.idea/externalDependencies.xml @@ -0,0 +1,6 @@ + + + + + + \ No newline at end of file diff --git a/.idea/inspectionProfiles/profiles_settings.xml b/.idea/inspectionProfiles/profiles_settings.xml new file mode 100644 index 0000000000..105ce2da2d --- /dev/null +++ b/.idea/inspectionProfiles/profiles_settings.xml @@ -0,0 +1,6 @@ + + + + \ No newline at end of file diff --git a/.idea/misc.xml b/.idea/misc.xml new file mode 100644 index 0000000000..dad652dad8 --- /dev/null +++ b/.idea/misc.xml @@ -0,0 +1,4 @@ + + + + \ No newline at end of file diff --git a/.idea/modules.xml b/.idea/modules.xml new file mode 100644 index 0000000000..e066844ef6 --- /dev/null +++ b/.idea/modules.xml @@ -0,0 +1,8 @@ + + + + + + + + \ No newline at end of file diff --git a/.idea/vcs.xml b/.idea/vcs.xml new file mode 100644 index 0000000000..5f0367233c --- /dev/null +++ b/.idea/vcs.xml @@ -0,0 +1,17 @@ + + + + + + + + + + + + + + + + + \ No newline at end of file diff --git a/apps/.idea/.gitignore b/apps/.idea/.gitignore new file mode 100644 index 0000000000..73f69e0958 --- /dev/null +++ b/apps/.idea/.gitignore @@ -0,0 +1,8 @@ +# Default ignored files +/shelf/ +/workspace.xml +# Datasource local storage ignored files +/dataSources/ +/dataSources.local.xml +# Editor-based HTTP Client requests +/httpRequests/ diff --git a/apps/.idea/apps.iml b/apps/.idea/apps.iml new file mode 100644 index 0000000000..4671b74793 --- /dev/null +++ b/apps/.idea/apps.iml @@ -0,0 +1,23 @@ + + + + + + + + + + + + + + + \ No newline at end of file diff --git a/apps/.idea/inspectionProfiles/profiles_settings.xml b/apps/.idea/inspectionProfiles/profiles_settings.xml new file mode 100644 index 0000000000..105ce2da2d --- /dev/null +++ b/apps/.idea/inspectionProfiles/profiles_settings.xml @@ -0,0 +1,6 @@ + + + + \ No newline at end of file diff --git a/apps/.idea/misc.xml b/apps/.idea/misc.xml new file mode 100644 index 0000000000..65531ca992 --- /dev/null +++ b/apps/.idea/misc.xml @@ -0,0 +1,4 @@ + + + + \ No newline at end of file diff --git a/apps/.idea/modules.xml b/apps/.idea/modules.xml new file mode 100644 index 0000000000..44330cbd0c --- /dev/null +++ b/apps/.idea/modules.xml @@ -0,0 +1,8 @@ + + + + + + + + \ No newline at end of file diff --git a/apps/.idea/vcs.xml b/apps/.idea/vcs.xml new file mode 100644 index 0000000000..dc68028ae1 --- /dev/null +++ b/apps/.idea/vcs.xml @@ -0,0 +1,14 @@ + + + + + + + + + + + + + + \ No newline at end of file diff --git a/apps/common/src/perl/MediaWords/Util/Mail/Message/Templates/email-templates b/apps/common/src/perl/MediaWords/Util/Mail/Message/Templates/email-templates index 400fcc5911..72917269b9 160000 --- a/apps/common/src/perl/MediaWords/Util/Mail/Message/Templates/email-templates +++ b/apps/common/src/perl/MediaWords/Util/Mail/Message/Templates/email-templates @@ -1 +1 @@ -Subproject commit 400fcc5911743ec7d544bf2cfff69926860ecbb8 +Subproject commit 72917269b93577b61dfb93637b8177ccd1fd9448 diff --git a/apps/common/tests/data/html-strip/strip.html b/apps/common/tests/data/html-strip/strip.html deleted file mode 100644 index b3f2a82d8e..0000000000 --- a/apps/common/tests/data/html-strip/strip.html +++ /dev/null @@ -1,2757 +0,0 @@ - - - - - FBI Investigation Helps Uncover Latest Bribery Scandal In Greece · Global Voices - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
-
Close

Support Global Voices

-

To stay independent, free, and sustainable, our community needs the help of friends and readers like you.

-

Donate now »

-
-
- -
-
-
-

See all those languages up there? We translate Global Voices stories to make the world's citizen media available to everyone.

- Learn more about Lingua Translation  » - -
-
-
-
-
- -
- -
-
-
-
- -
-

FBI Investigation Helps Uncover Latest Bribery Scandal In Greece

- -
-
-
- - -
-
- -
-
-
-
-

Bribery. Image from Pixabay, CC0.

-

In the last few weeks, Greece has witnessed an increasingly turbulent - political scene. Initially, a large part of the current political -events was due to the re-emerging national debate about the FYROM/Macedonia naming dispute; however, - the latest turmoil has been triggered by a financial scandal involving -the pharmaceutical colossus Novartis and alleged bribes to political -officials.

-

On 6 February 2018, Parliament received allegations against the -pharmaceutical company which were made by anonymous witnesses in the -case. According to the earliest known facts, there are indications of -“bribes” which tallied up to 50 million euros and involved 10 -high-ranking political figures from the previous Greek administration in - a total of more than 30 people, including former general secretaries, -political counselors and former directors of government agencies. The -loss for the Greek state is estimated to exceed 3 billion euros for the period between 2000-2015.

-

Τhe scandal has already been dubbed a “megaton bomb“. Minister of Justice Stavros Kontonis has called it “the biggest scandal since the creation of the modern Greek state“, even surpassing the Siemens Case, - a corruption and bribery scandal regarding -security systems and the purchase of state contracts in the 1990s.

-

Novartis and the current investigation

-

Novartis Building in Basel, Switzerland. Wikimedia Commons, CC BY 2.0

-

Novartis, a multinational healthcare company based in Switzerland, is - at the epicenter of a global investigation about alleged illegal -payments to state officials which led to unfair competition practices -and established its leading position in the global market.

-

Since 2014, the U.S. Securities and Exchange Commission and the FBI have been investigating and discovering numerous company malpractices - — allegedly applied in Greece as well. These malpractices include -bribes to politicians, state officials associated with the Ministry of -Health, doctors and hospital staff. Doctors would allegedly promote and -prescribe Novartis’ products which would sometimes cost much more than -their competition.

-

As pointed out in several media outlets over the last few days:

- -

Greece - is a country of special interest for Novartis; the price of a product -in Greece would influence its price worldwide. An increase of 10 cents -in Greece would bring a 25 million profit for the company.

-

Novartis had played a crucial role in promoting ophthalmologic and -oncological drugs for cancer patients as well as high numbers of -vaccines against the H1N1 flu virus with significant financial gain for -the company. As indicated in a 2009 article by news portal Euro2Day:

- -

The stock of the company is currently recording a 3.16% rise.

-

Twitter user NikosBovolos now sarcastically wonders:

- -

Who could imagine that we would see -scandals in the Health sector, when [ex-Ministry of Health and now -accused] Mr. Avramopoulos ordered 16 million vaccines for a country with - a population of 10 million?

-

Reactions from the Greek political scene

-

After news of this latest scandal hit the press, social media outlets - started getting flooded with reactions, comments and memes by netizens -under the hashtags #novartis or #Novartis_gate. There - have also been reactions offline in response to the allegations by the -accused prominent political figures themselves.

-

Among the first questions raised about the credibility of this latest news was the convenient timing of its revelation. The - anonymous protected witness testimonies were given last November and -the last one was given on February 4th, at the peak of the -“Macedonia/FYROM talks”. The news about the scandal gained more traction - around the same time a massive rally took place in Athens which -protested the use of the term “Macedonia” as an official state name by -neighboring FYROM. As the current scandal takes over headlines, some -feel that this might be a tactic to distract from this pressing -political issue affecting the current administration:

- -

See how easily agenda and headlines are shifted and the rally just fades out

- -

A “Novartis” [case] a day keeps “Macedonia” [case] away

- -

Even if you reveal 100 Novartis [cases], Macedonia is ONE and is Greek

-

In addition, a crucial issue for the Prosecutor of the Corruption -Bureau is preserving the anonymity of the protected witnesses — two men -and a woman — since they have allegedly expressed fear for their own -lives. And not unreasonably.

-

One of the accused Greek politicians, Evangelos Venizelos, former -Deputy Prime Minister and current Minister in various departments for -the PASOK party, asked for the witnesses’ protection to be canceled so that anyone can sue them for slander or perjury. Later, he made comments which many felt threatened the witnesses.

- -

Evangelos Venizelos for the Novartis scandal witnesses: “…poor people, they believe they will be protected forever”.

-

Maybe the vilest phrase said by a politician at the 21st century. And the competition is rather not insignificant…

-

Marios Salmas, the former Deputy Minister of Health for the New -Democracy party, also accused of receiving illegal payments by Novartis, - appeared live in a TV broadcast and also threatened to punish the -case's protected witnesses. According to the feature video in an article by the Tribune news portal, he insists:

- -

I - only say one thing. Those witnesses that testified here — even if they -got new identities — will be found and will be punished. Greece is a -small country.

-

Twitter user @Kyria_Katy acutely mourned:

- -

I live in the Country, where an ex Minister of Health of remarkable nerve threatens protected witnesses in a live TV broadcast.

-

And user @Paralogistis exclaimed:

- -

Greece, the only country in the planet where the ones judged are the witnesses.

-

[The image reads]:
-Novartis Scandal.
-Ev. Venizelos: Do the witnesses think they will be forever protected?
-M. Salmas: The protected witnesses will be found and punished!

-

A public debate has also been initiated regarding whether or not the -allegations made in the testimonies have any base or are simply an -attempt to tarnish prominent political figures from both the opposition -ex-ruling parties of PASOK and New Democracy, as German business newspaper Handelsblatt notes.

-

The ten prominent politicians, two ex-Prime Ministers and eight -ex-Ministers alleged to be involved in the scandal, have denounced the -accusations against them. Some of them even threatened to react with -lawsuits, calling the case “political targeting” and “bullying”. Ex-Prime Minister Antonis Samaras declared that he - will press charges against Prime Minister Alexis Tsipras and against -Dimitris Papaggelopoulos, ex-Director General of the National -Intelligence Service and current Minister of Justice.

-

Another accused MP, Adonis Georgiadis, ex-Minister of Health for the -New Democracy party, known for his controversial statements, expressed -his belief that the whole scandal is “set and directed” by the Alternate Minister of Health Pavlos Polakis, who was implied to be an FBI agent.

-

Mr. Polakis himself answered via his Facebook page with a hilarious meme he had received from netizens:

-

-

Adonis, you are right!!!! I even received my ID card !!!!
-Haha, I just died laughing, I was sent this not long ago!!!

-

The main argument against the accusations of this case is that the -alleged perpetrators claim they cannot be brought to justice due to a -statute of limitations.

- -

How much nerve you must have, when -you tweet a news story saying that the Novartis scandal offenses – taken - place during your term as a Minister – have been time-barred! As if he -even had honor and lost it!

- -

Is it possible that the official -argumental line of New Democracy party about #Novartis_Gate focuses at -the possibility of time-barred offenses? Seriously now?

-

This scandal is unfolding slowly and has been shaking the whole Greek - political and social scene. However, there are doubts about whether or -not any real results will come of the case, as many feel that Greece has - a bad history of bringing “real” justice to similar cases. Let's see -what the repercussions will be in the following days and months.

- -
- - - - -
-
- - - - - - - - -
-
- - -

- Start the conversation

- - -
- -

Authors, please log in »

- -

-

- -

-

- -

-

- - - - - - -

- -

Guidelines

-
    -
  • All comments are reviewed by a moderator. Do not submit your comment more than once or it may be identified as spam.
  • - -
  • Please treat others with respect. Comments containing hate speech, obscenity, and personal attacks will not be approved.
  • -
- -

- - - -

-

-
-
- -
-
- -
-
- -
-
- -
- -
-
-
- - - - - - - - - - -
- -
-

Receive great stories from around the world directly in your inbox.

-

- -
-
-
- - - - - -
- -
-
- - -
- - -
-
- - -
-
- - -
-
- - -
-
- - -
-
- - -
-
- - -
-
- - -
-
- - -
-
- - -
-
- - -
-
- - -
-
- - -
-
- - -
-
- - -
-
- - -
-
- - -
-
- - -
-
- - -
-
- - -
-
- - -
-
- - -
-
- - -
- * = required field
- -
- Email Frequency
-
- - - - - -
- - -
- - -
-
- - - - - - - -
- Sign up to receive the best of Global Voices
-
-
-
- -No thanks, show me the site
- -
- - - - - - - - - - - - - - - - - -
\ No newline at end of file diff --git a/apps/common/tests/perl/MediaWords/DBI/Media.t b/apps/common/tests/perl/MediaWords/DBI/Media.t deleted file mode 100644 index 3f578e3c18..0000000000 --- a/apps/common/tests/perl/MediaWords/DBI/Media.t +++ /dev/null @@ -1,110 +0,0 @@ -use strict; -use warnings; - -# tests for MediaWords::DBI::Media - -use Readonly; -use Test::More; - -use MediaWords::DB; -use MediaWords::DBI::Media; -use MediaWords::Test::DB::Create; - -# test that medium_is_ready_for_analysis returns false when there are few than 100 stories and they are recent -sub test_few_recent_stories($) -{ - my ( $db ) = @_; - - my $label = 'few recent stories'; - - my $test_stack = - MediaWords::Test::DB::Create::create_test_story_stack( $db, { "$label medium" => { "feed" => [ 'story' ] } } ); - - my $medium = $test_stack->{ "$label medium" }; - - $db->query( "update feeds set active = 't' where media_id = \$1", $medium->{ media_id } ); - - $db->query( "update stories set collect_date = now() where media_id = \$1", $medium->{ media_id } ); - - ok( !MediaWords::DBI::Media::medium_is_ready_for_analysis( $db, $medium ), $label ); -} - -# test that medium_is_ready_for_analysis returns false when there is a single story if that story is old -sub test_few_old_stories($) -{ - my ( $db ) = @_; - - my $label = 'few old stories'; - - my $test_stack = - MediaWords::Test::DB::Create::create_test_story_stack( $db, { "$label medium" => { "feed" => [ 'story' ] } } ); - - my $medium = $test_stack->{ "$label medium" }; - - $db->query( "update feeds set active = 't' where media_id = \$1", $medium->{ media_id } ); - - $db->query( <{ media_id } ); -update stories set publish_date = now() - '1 year'::interval where media_id = ? -SQL - - ok( !MediaWords::DBI::Media::medium_is_ready_for_analysis( $db, $medium ), $label ); -} - -# test that the medium_is_ready_for_analysis returns false when there are no stories -sub test_no_stories($) -{ - my ( $db ) = @_; - - my $label = 'no stories'; - - my $test_stack = MediaWords::Test::DB::Create::create_test_story_stack( $db, { "$label medium" => { "feed" => [] } } ); - - my $medium = $test_stack->{ "$label medium" }; - - $db->query( "update feeds set active = 't' where media_id = \$1", $medium->{ media_id } ); - - ok( !MediaWords::DBI::Media::medium_is_ready_for_analysis( $db, $medium ), $label ); -} - -# test that the medium_is_ready_for_analysis returns false when there are few than 100 stories and -# there is no active feed -sub test_no_active_feed($) -{ - my ( $db ) = @_; - - my $label = 'no active feed'; - - my $test_stack = - MediaWords::Test::DB::Create::create_test_story_stack( $db, { "$label medium" => { "feed" => [ 'story' ] } } ); - - my $medium = $test_stack->{ "$label medium" }; - - $db->query( "update feeds set active = 'f' where media_id = \$1", $medium->{ media_id } ); - - $db->query( <{ media_id } ); -update stories set publish_date = now() - '1 year'::interval where media_id = ? -SQL - - ok( !MediaWords::DBI::Media::medium_is_ready_for_analysis( $db, $medium ), $label ); -} - -sub test_medium_is_ready_for_analysis -{ - my ( $db ) = @_; - - test_few_recent_stories( $db ); - test_few_old_stories( $db ); - test_no_active_feed( $db ); - test_no_stories( $db ); -} - -sub main -{ - my $db = MediaWords::DB::connect_to_db(); - - test_medium_is_ready_for_analysis( $db ); - - done_testing(); -} - -main(); diff --git a/apps/common/tests/perl/MediaWords/DBI/Stories/AP.t b/apps/common/tests/perl/MediaWords/DBI/Stories/AP.t deleted file mode 100644 index a4904d91aa..0000000000 --- a/apps/common/tests/perl/MediaWords/DBI/Stories/AP.t +++ /dev/null @@ -1,137 +0,0 @@ -use strict; -use warnings; - -use Test::More tests => 13; -use Test::NoWarnings; - -use MediaWords::CommonLibs; -use MediaWords::DB; - -use Data::Dumper; -use MediaWords::Test::DB::Create; -use MediaWords::DBI::Stories::AP; - -my $_test_feed; - -sub _get_test_feed($) -{ - my ( $db ) = @_; - - if ( !$_test_feed ) - { - my $test_medium = MediaWords::Test::DB::Create::create_test_medium( $db, 'test' ); - $_test_feed = MediaWords::Test::DB::Create::create_test_feed( $db, 'test', $test_medium ); - } - - return $_test_feed; - -} - -sub test_story($$$$) -{ - my ( $db, $content, $expected, $label ) = @_; - - my $test_feed = _get_test_feed( $db ); - - my $story = MediaWords::Test::DB::Create::create_test_story( $db, $label, $test_feed ); - - $story->{ content } = $content; - - $story = MediaWords::Test::DB::Create::add_content_to_test_story( $db, $story, $test_feed ); - - my $got = MediaWords::DBI::Stories::AP::is_syndicated( $db, $content, $story->{ title }, $story->{ language } ); - - is( $got, $expected, "story is syndicated: $label" ); -} - -sub get_ap_sentences() -{ - return [ - 'AP sentence < 32.', - 'AP sentence >= 32 #1 (with some more text to pad out the length to 32).', - 'AP sentence >= 32 #2 (with some more text to pad out the length to 32).', - 'AP sentence >= 32 #3 (with some more text to pad out the length to 32).', - 'AP sentence >= 32 #4 (with some more text to pad out the length to 32).', - 'AP sentence >= 32 #5 (with some more text to pad out the length to 32).', - 'AP sentence >= 32 #6 (with some more text to pad out the length to 32).', - 'AP sentence >= 32 #7 (with some more text to pad out the length to 32).', - 'AP sentence >= 32 #8 (with some more text to pad out the length to 32).', - 'AP sentence >= 32 #9 (with some more text to pad out the length to 32).', - 'AP sentence >= 32 #10 (with some more text to pad out the length to 32).', - 'AP sentence >= 32 #11 (with some more text to pad out the length to 32).', - 'AP sentence >= 32 #12 (with some more text to pad out the length to 32).', - 'AP sentence >= 32 #13 (with some more text to pad out the length to 32).', - 'AP sentence >= 32 #14 (with some more text to pad out the length to 32).', - ]; -} - -# add ap medium and some content so that we can find dup sentences -sub add_ap_content($) -{ - my ( $db ) = @_; - - my $ap_medium = - MediaWords::Test::DB::Create::create_test_medium( $db, MediaWords::DBI::Stories::AP::get_ap_medium_name() ); - - my $feed = MediaWords::Test::DB::Create::create_test_feed( $db, 'feed', $ap_medium ); - - my $story = MediaWords::Test::DB::Create::create_test_story( $db, 'story', $feed ); - - $story->{ content } = join( "\n", @{ get_ap_sentences() } ); - - $story = MediaWords::Test::DB::Create::add_content_to_test_story( $db, $story, $feed ); -} - -sub test_ap_calls($) -{ - my ( $db ) = @_; - - add_ap_content( $db ); - - my $ap_sentences = get_ap_sentences(); - my $ap_content_single_16_sentence = [ grep { length( $_ ) < 32 } @{ $ap_sentences } ]->[ 0 ]; - my $ap_content_32_sentences = [ grep { length( $_ ) > 32 } @{ $ap_sentences } ]; - my $ap_content_single_32_sentence = $ap_content_32_sentences->[ 0 ]; - - test_story( $db, 'foo', 0, "simple unsyndicated story" ); - - test_story( $db, '(ap)', 1, "simple (ap) pattern" ); - - test_story( $db, "associated press", 0, "only associated press" ); - - test_story( $db, "'associated press'", 1, "quoted associated press" ); - - test_story( $db, <query( "select * from stories limit 1 offset 3" )->hash; - - { - MediaWords::DBI::Stories::GuessDate::assign_date_guess_method( $db, $story, 'undateable' ); - - my $got_tag = $db->query( <{ stories_id } )->hash; -select t.* from tags t join stories_tags_map stm using ( tags_id ) where stm.stories_id = ? -SQL - - is( $got_tag->{ tag }, 'undateable', "assign_date_guess_method: undateable" ); - } - { - MediaWords::DBI::Stories::GuessDate::assign_date_guess_method( $db, $story, 'foo bar / and ; baz' ); - - my $got_tag = $db->query( <{ stories_id } )->hash; -select t.* from tags t join stories_tags_map stm using ( tags_id ) where stm.stories_id = ? -SQL - - is( $got_tag->{ tag }, 'unknown', "assign_date_guess_method: unknown" ); - } -} - -sub main -{ - my $db = MediaWords::DB::connect_to_db(); - - test_assign_date_guess_method( $db ); - - done_testing(); -} - -main(); diff --git a/apps/common/tests/perl/MediaWords/Feed/Parse.t b/apps/common/tests/perl/MediaWords/Feed/Parse.t deleted file mode 100644 index caba8fcfae..0000000000 --- a/apps/common/tests/perl/MediaWords/Feed/Parse.t +++ /dev/null @@ -1,135 +0,0 @@ -use strict; -use warnings; -use utf8; - -use Modern::Perl "2015"; -use MediaWords::CommonLibs; - -use Test::NoWarnings; -use Test::More tests => 36; - -use_ok( 'MediaWords::Feed::Parse' ); - -use Data::Dumper; -use MediaWords::Test::URLs; - -sub _test_feed_contents($) -{ - my $feed_contents = shift; - - my $feed = MediaWords::Feed::Parse::parse_feed( $feed_contents ); - ok( $feed, "Unable to parse feed" ); - - is( $feed->title(), 'Test feed' ); - - my $items = $feed->items(); - is( scalar( @{ $items } ), 2 ); - - my $first_item = $items->[ 0 ]; - is( $first_item->title(), 'First item' ); - is( $first_item->link(), 'http://www.example.com/first_item.html' ); - is( $first_item->publish_date(), '2016-12-14T04:04:01Z' ); - - # publish_date_sql() is dependent on machine's timezone (which shouldn't be the case, but it is) - like( $first_item->publish_date_sql(), qr/2016-12-1\d \d\d:\d\d:\d\d/ ); - - is( $first_item->guid(), 'http://www.example.com/first_item.html' ); - is( $first_item->guid_if_valid(), 'http://www.example.com/first_item.html' ); - is( $first_item->description(), 'This is a first item.' ); - - my $second_item = $items->[ 1 ]; - is( $second_item->title(), 'ɯǝʇı puoɔǝS' ); - is( $second_item->link(), 'http://www.example.com/second_item.html' ); - is( $second_item->publish_date(), '2016-12-14T04:05:01Z' ); - - # publish_date_sql() is dependent on machine's timezone (which shouldn't be the case, but it is) - like( $second_item->publish_date_sql(), qr/2016-12-1\d \d\d:\d\d:\d\d/ ); - - is( $second_item->guid(), 'http://www.example.com/second_item.html' ); - is( $second_item->guid_if_valid(), 'http://www.example.com/second_item.html' ); - is( $second_item->description(), 'This is a second item.' ); -} - -sub test_rss_feed() -{ - my $rss_feed = < - - - Test feed - http://www.example.com/ - This is a test feed. - Wed, 14 Dec 2016 04:00:00 GMT - - First item - foobar\@example.com - http://www.example.com/first_item.html - Wed, 14 Dec 2016 04:04:01 GMT - http://www.example.com/first_item.html - This is a first item. - - - ɯǝʇı puoɔǝS - foobar\@example.com - http://www.example.com/second_item.html - Wed, 14 Dec 2016 04:05:01 GMT - http://www.example.com/second_item.html - - - - -XML - - _test_feed_contents( $rss_feed ); -} - -sub test_atom_feed() -{ - my $atom_feed = < - - - Test feed - This is a test feed. - - 2016-12-14T04:00:00Z - http://www.example.com/ - - First item - - http://www.example.com/first_item.html - - Foo Bar - - 2016-12-14T04:04:01Z - This is a first item. - - - ɯǝʇı puoɔǝS - - http://www.example.com/second_item.html - - Foo Bar - - 2016-12-14T04:05:01Z - - - -XML - - _test_feed_contents( $atom_feed ); -} - -sub main() -{ - # Test::More UTF-8 output - my $builder = Test::More->builder; - binmode $builder->output, ":utf8"; - binmode $builder->failure_output, ":utf8"; - binmode $builder->todo_output, ":utf8"; - - test_rss_feed(); - test_atom_feed(); -} - -main(); diff --git a/apps/common/tests/perl/MediaWords/Job/Broker b/apps/common/tests/perl/MediaWords/Job/Broker deleted file mode 120000 index 449f91ae57..0000000000 --- a/apps/common/tests/perl/MediaWords/Job/Broker +++ /dev/null @@ -1 +0,0 @@ -../../../python/mediawords/job/test_broker \ No newline at end of file diff --git a/apps/common/tests/perl/MediaWords/Job/Broker-fatal_error b/apps/common/tests/perl/MediaWords/Job/Broker-fatal_error deleted file mode 120000 index 9c7b48da17..0000000000 --- a/apps/common/tests/perl/MediaWords/Job/Broker-fatal_error +++ /dev/null @@ -1 +0,0 @@ -../../../python/mediawords/job/test_broker_fatal_error \ No newline at end of file diff --git a/apps/common/tests/perl/MediaWords/Job/Broker-fatal_error.t b/apps/common/tests/perl/MediaWords/Job/Broker-fatal_error.t deleted file mode 100644 index 972c117a98..0000000000 --- a/apps/common/tests/perl/MediaWords/Job/Broker-fatal_error.t +++ /dev/null @@ -1,102 +0,0 @@ -package BrokerTest::Test; - -use strict; -use warnings; - -use lib '/opt/mediacloud/tests/perl/MediaWords/Job/'; -use base qw(SetupBrokerTest); - -use Modern::Perl "2015"; -use MediaWords::CommonLibs; - -use Data::Dumper; -use Errno; -use Proc::ProcessTable; -use Test::More; - -use MediaWords::Job::Broker; - -local $| = 1; - - -sub worker_paths() -{ - my $workers_path = '/opt/mediacloud/tests/perl/MediaWords/Job/Broker-fatal_error'; - - return [ - { - 'queue_name' => 'TestPerlWorkerFatalError', - 'worker_path' => "$workers_path/perl_worker.pl", - }, - { - 'queue_name' => 'TestPythonWorkerFatalError', - 'worker_path' => "$workers_path/python_worker.py", - } - ]; -} - -sub broker_class() -{ - return 'MediaWords::Job::Broker'; -} - -sub _pid_exists($) -{ - my $pid = shift; - - say STDERR "Looking for PID $pid"; - - my $t = Proc::ProcessTable->new(); - - foreach my $process ( @{ $t->table } ) { - if ( $process->pid == $pid ) { - - say STDERR "Testing PID " . $process->pid . " with state " . $process->state; - - # Zombie processes don't count - if ( $process->state ne 'defunct' ) { - return 1; - } - } - } - - return 0; -} - -sub test_fatal_error : Test(no_plan) -{ - my $self = shift; - - INFO "Waiting for workers to start..."; - sleep( 5 ); - INFO "Done waiting"; - - for my $worker (@{ $self->{ WORKERS }}) { - - my $worker_pid = $worker->{ 'process_pids' }->[ 0 ]; - ok( _pid_exists( $worker_pid ), "PID $worker_pid is still running" ); - - $worker->{ 'app' }->add_to_queue(); - - for ( my $retry = 0; $retry < 20; ++$retry ) { - INFO "Waiting for the process $worker_pid to stop (retry $retry)..."; - if ( _pid_exists( $worker_pid ) ) { - sleep( 1 ); - } else { - INFO "Process stopped"; - last; - } - } - - ok( ! _pid_exists( $worker_pid ), "Process has managed to stop" ); - - # Not sure how to test exit code here - } -} - -sub main() -{ - Test::Class->runtests; -} - -main(); diff --git a/apps/common/tests/perl/MediaWords/Job/Broker-lock b/apps/common/tests/perl/MediaWords/Job/Broker-lock deleted file mode 120000 index 1c62248544..0000000000 --- a/apps/common/tests/perl/MediaWords/Job/Broker-lock +++ /dev/null @@ -1 +0,0 @@ -../../../python/mediawords/job/test_broker_lock \ No newline at end of file diff --git a/apps/common/tests/perl/MediaWords/Job/Broker-lock.t b/apps/common/tests/perl/MediaWords/Job/Broker-lock.t deleted file mode 100644 index 8ca9362c51..0000000000 --- a/apps/common/tests/perl/MediaWords/Job/Broker-lock.t +++ /dev/null @@ -1,78 +0,0 @@ -package BrokerTest::Test; - -use strict; -use warnings; - -use lib '/opt/mediacloud/tests/perl/MediaWords/Job/'; -use base qw(SetupBrokerTest); - -use Modern::Perl "2015"; -use MediaWords::CommonLibs; - -use Data::Dumper; -use Test::More; - -use MediaWords::Job::Broker; - -local $| = 1; - - -sub worker_paths() -{ - my $workers_path = '/opt/mediacloud/tests/perl/MediaWords/Job/Broker-lock'; - - # Need 2+ workers to see the effect of locking - my $worker_count = 2; - - return [ - { - 'queue_name' => 'TestPerlWorkerLock', - 'worker_path' => "$workers_path/perl_worker.pl", - 'worker_count' => $worker_count, - }, - { - 'queue_name' => 'TestPythonWorkerLock', - 'worker_path' => "$workers_path/python_worker.py", - 'worker_count' => $worker_count, - } - ]; -} - -sub broker_class() -{ - return 'MediaWords::Job::Broker'; -} - -sub test_lock : Test(no_plan) -{ - my $self = shift; - - my $lock_test_id = 123; - - for my $worker (@{ $self->{ WORKERS }}) { - - INFO "Adding the first job to the queue which will take 10+ seconds to run..."; - my $job_id = $worker->{ 'app' }->add_to_queue( { 'test_id' => $lock_test_id, 'x' => 2, 'y' => 3 } ); - - INFO "Waiting for the job to reach the queue..."; - sleep( 2 ); - - # While assuming that the first job is currently running (and thus is "locked"): - INFO "Testing if a subsequent job fails with a lock problem..."; - is( - $worker->{ 'app' }->run_remotely( { 'test_id' => $lock_test_id, 'x' => 3, 'y' => 4 } ), - undef, - "Second job shouldn't work", - ); - - INFO "Waiting for the first job to finish..."; - is( $worker->{ 'app' }->get_result( $job_id ), 5 ); - } -} - -sub main() -{ - Test::Class->runtests; -} - -main(); diff --git a/apps/common/tests/perl/MediaWords/Job/Broker-state b/apps/common/tests/perl/MediaWords/Job/Broker-state deleted file mode 120000 index f286a2ed3b..0000000000 --- a/apps/common/tests/perl/MediaWords/Job/Broker-state +++ /dev/null @@ -1 +0,0 @@ -../../../python/mediawords/job/test_broker_state \ No newline at end of file diff --git a/apps/common/tests/perl/MediaWords/Job/Broker-state.t b/apps/common/tests/perl/MediaWords/Job/Broker-state.t deleted file mode 100644 index fd359b43d0..0000000000 --- a/apps/common/tests/perl/MediaWords/Job/Broker-state.t +++ /dev/null @@ -1,210 +0,0 @@ -package BrokerTest::Test; - -use strict; -use warnings; - -use lib '/opt/mediacloud/tests/perl/MediaWords/Job/'; -use base qw(SetupBrokerTest); - -use Modern::Perl "2015"; -use MediaWords::CommonLibs; - -use Data::Dumper; -use Sys::Hostname; -use Test::More; - -use MediaWords::DB; -use MediaWords::Job::StatefulBroker; -use MediaWords::Job::State; -use MediaWords::Util::ParseJSON; - - -local $| = 1; - - -sub worker_paths() -{ - my $workers_path = '/opt/mediacloud/tests/perl/MediaWords/Job/Broker-state'; - - return [ - - { - 'queue_name' => 'TestPerlWorkerStateCompleted', - 'worker_path' => "$workers_path/perl_worker_completed.pl", - }, - { - 'queue_name' => 'TestPerlWorkerStateCustom', - 'worker_path' => "$workers_path/perl_worker_custom.pl", - }, - { - 'queue_name' => 'TestPerlWorkerStateError', - 'worker_path' => "$workers_path/perl_worker_error.pl", - }, - { - 'queue_name' => 'TestPerlWorkerStateRunning', - 'worker_path' => "$workers_path/perl_worker_running.pl", - }, - { - 'queue_name' => 'TestPerlWorkerStateQueued', - 'worker_path' => "$workers_path/perl_worker_queued.pl", - }, - - { - 'queue_name' => 'TestPythonWorkerStateCompleted', - 'worker_path' => "$workers_path/python_worker_completed.py", - }, - { - 'queue_name' => 'TestPythonWorkerStateCustom', - 'worker_path' => "$workers_path/python_worker_custom.py", - }, - { - 'queue_name' => 'TestPythonWorkerStateError', - 'worker_path' => "$workers_path/python_worker_error.py", - }, - { - 'queue_name' => 'TestPythonWorkerStateRunning', - 'worker_path' => "$workers_path/python_worker_running.py", - }, - { - 'queue_name' => 'TestPythonWorkerStateQueued', - 'worker_path' => "$workers_path/python_worker_queued.py", - }, - - ]; -} - -sub broker_class() -{ - return 'MediaWords::Job::StatefulBroker'; -} - -sub test_state : Test(no_plan) -{ - my $self = shift; - - my $db = MediaWords::DB::connect_to_db(); - - $db->query(<query("DELETE FROM job_states"); - $db->query("DELETE FROM test_job_states"); - - my $common_kwargs = { 'x' => 2, 'y' => 3 }; - my $expected_result = $common_kwargs->{ 'x' } + $common_kwargs->{ 'y' }; - - my $worker_types = [ - { - 'queue_name_ends_with' => 'Completed', - 'expected_result' => $expected_result, - 'expected_state' => $MediaWords::Job::State::STATE_COMPLETED, - 'expected_message' => '', - }, - { - 'queue_name_ends_with' => 'Custom', - 'expected_result' => undef, # never finishes - 'expected_state' => 'foo', - 'expected_message' => 'bar', - }, - { - 'queue_name_ends_with' => 'Error', - 'expected_result' => undef, # fails - 'expected_state' => $MediaWords::Job::State::STATE_ERROR, - 'expected_message' => "Well, it didn't work", - }, - { - 'queue_name_ends_with' => 'Running', - 'expected_result' => undef, # never finishes - 'expected_state' => $MediaWords::Job::State::STATE_RUNNING, - 'expected_message' => '', - }, - { - 'queue_name_ends_with' => 'Queued', - 'expected_result' => undef, # never starts - 'expected_state' => $MediaWords::Job::State::STATE_QUEUED, - 'expected_message' => '', - }, - ]; - - for my $worker_type ( @{ $worker_types } ) { - - my $applicable_workers = []; - for my $worker (@{ $self->{ WORKERS }}) { - my $queue_name_ends_with = $worker_type->{ 'queue_name_ends_with' }; - if ( $worker->{ 'app' }->queue_name() =~ m/\Q$queue_name_ends_with\E$/ ) { - push( @{ $applicable_workers }, $worker ); - } - } - - ok( scalar( @{ $applicable_workers }), "No workers found for type " . Dumper( $worker_type )); - - for my $worker (@{ $applicable_workers }) { - - $db->query( "DELETE FROM test_job_states" ); - - my $test_job_state = $db->insert( 'test_job_states', { - 'state' => '', - 'message' => '', - }); - my $test_job_states_id = $test_job_state->{ 'test_job_states_id' }; - - my $worker_args = { 'test_job_states_id' => $test_job_states_id }; - my $kwargs = { %{ $common_kwargs }, %{ $worker_args } }; - - my $job_id = $worker->{ 'app' }->add_to_queue( $kwargs ); - - if ( $worker_type->{ 'expected_result' } ) { - INFO "Fetching and comparing result for worker " . Dumper( $worker_type ); - my $result = $worker->{ 'app' }->get_result( $job_id ); - is( $result, $expected_result, "Result for worker " . Dumper( $worker_type )); - } else { - # Just wait a bit for the thing to finish - INFO "No result is expected, waiting for worker " . Dumper( $worker_type ); - sleep( 5 ); - } - - my $job_states = $db->query(<{ 'app' }->queue_name(), - )->hashes(); - is( scalar(@{ $job_states }), 1, "Job state count for worker " . Dumper( $worker_type ) ); - - my $job_state = $job_states->[ 0 ]; - - is( $job_state->{ 'state' }, $worker_type->{ 'expected_state' }, "Job state for worker " . Dumper( $worker_type ) . ", row: " . Dumper( $job_state ) ); - - if ( $worker_type->{ 'expected_state' } ne $MediaWords::Job::State::STATE_QUEUED ) { - my $expected_message = $worker_type->{ 'expected_message' }; - like( $job_state->{ 'message' }, qr/\Q$expected_message\E/, "Job message for worker " . Dumper( $worker_type ) ); - ok( $job_state->{ 'last_updated' }, "Job's last updated for worker " . Dumper( $worker_type ) ); - is_deeply( MediaWords::Util::ParseJSON::decode_json( $job_state->{ 'args' }), $kwargs, "Job's arguments for worker " . Dumper( $worker_type ) ); - is( $job_state->{ 'hostname' }, Sys::Hostname::hostname, "Job's hostname for worker " . Dumper( $worker_type ) ); - - my $custom_table_states = $db->select( 'test_job_states', '*' )->hashes(); - is( scalar( @{ $custom_table_states }), 1, "Custom table states count for worker " . Dumper( $worker_type ) ); - my $custom_table_state = $custom_table_states->[ 0 ]; - - is( $custom_table_state->{ 'state' }, $worker_type->{ 'expected_state' }, "Custom table state for worker " . Dumper( $worker_type ) . ", row: " . Dumper( $job_state ) ); - like( $custom_table_state->{ 'message' }, qr/\Q$expected_message\E/, "Custom table message for worker " . Dumper( $worker_type ) ); - } - } - - } -} - -sub main() -{ - Test::Class->runtests; -} - -main(); diff --git a/apps/common/tests/perl/MediaWords/Job/Broker.t b/apps/common/tests/perl/MediaWords/Job/Broker.t deleted file mode 100644 index e86a2f4346..0000000000 --- a/apps/common/tests/perl/MediaWords/Job/Broker.t +++ /dev/null @@ -1,70 +0,0 @@ -package BrokerTest::Test; - -use strict; -use warnings; - -use lib '/opt/mediacloud/tests/perl/MediaWords/Job/'; -use base qw(SetupBrokerTest); - -use Modern::Perl "2015"; -use MediaWords::CommonLibs; - -use Data::Dumper; -use Test::More; - -use MediaWords::Job::Broker; - -local $| = 1; - - -sub worker_paths() -{ - my $workers_path = '/opt/mediacloud/tests/perl/MediaWords/Job/Broker'; - - return [ - { - 'queue_name' => 'TestPerlWorker', - 'worker_path' => "$workers_path/perl_worker.pl", - }, - { - 'queue_name' => 'TestPythonWorker', - 'worker_path' => "$workers_path/python_worker.py", - } - ]; -} - -sub broker_class() -{ - return 'MediaWords::Job::Broker'; -} - -sub test_run_remotely : Test(no_plan) -{ - my $self = shift; - - for my $worker (@{ $self->{ WORKERS }}) { - my $result = $worker->{ 'app' }->run_remotely( { 'x' => 1, 'y' => 2 }); - is( $result, 3, "Result is correct for worker " . Dumper( $worker ) ); - } - -} - -sub test_add_to_queue_get_result : Test(no_plan) -{ - my $self = shift; - - for my $worker (@{ $self->{ WORKERS }}) { - my $job_id = $worker->{ 'app' }->add_to_queue( { 'x' => 3, 'y' => 4 }); - INFO "Job ID: $job_id for worker " . Dumper( $worker ); - - my $result = $worker->{ 'app' }->get_result( $job_id ); - is( $result, 7, "Result is correct for worker " . Dumper( $worker ) ); - } -} - -sub main() -{ - Test::Class->runtests; -} - -main(); diff --git a/apps/common/tests/perl/MediaWords/Job/SetupBrokerTest.pm b/apps/common/tests/perl/MediaWords/Job/SetupBrokerTest.pm deleted file mode 100644 index d65207a0ff..0000000000 --- a/apps/common/tests/perl/MediaWords/Job/SetupBrokerTest.pm +++ /dev/null @@ -1,92 +0,0 @@ -package SetupBrokerTest; - -use strict; -use warnings; -use base qw(Test::Class); - -use Test::More; - -use Modern::Perl "2015"; -use MediaWords::CommonLibs; - -use MediaWords::Job::Broker; -use MediaWords::Job::StatefulBroker; - -local $| = 1; - - -# Abstract method -sub worker_paths() -{ - LOGDIE "Abstract method."; -} - -# Abstract method -sub broker_class() -{ - LOGDIE "Abstract method."; -} - -sub start_workers : Test(startup) -{ - my $self = shift; - - DEBUG "Starting workers: " . Dumper( $self->worker_paths() ); - - $self->{ WORKERS } = []; - - for my $worker ( @{ $self->worker_paths() } ) { - ok( -f $worker->{ 'worker_path' }, "Worker script exists at " . $worker->{ 'worker_path' } ); - ok( -x $worker->{ 'worker_path' }, "Worker script is executable at " . $worker->{ 'worker_path' } ); - - my $broker_class = $self->broker_class(); - - my $worker_app = $broker_class->new( $worker->{ 'queue_name' } ); - DEBUG "Worker app: " . Dumper( $worker_app ); - - my $process_pids = []; - - unless ( defined $worker->{ 'worker_count' } ) { - $worker->{ 'worker_count' } = 1; - } - - ok( $worker->{ 'worker_count' }, "Worker count has to be positive" ); - - for ( my $x = 0; $x < $worker->{ 'worker_count' }; ++$x ) { - - my $worker_pid = fork(); - unless ( $worker_pid ) { - setpgrp(); - system( $worker->{ 'worker_path' } ); - exit( 0 ); - } else { - push( @{ $process_pids }, $worker_pid ); - } - } - - push( @{ $self->{ WORKERS } }, { - 'app' => $worker_app, - 'process_pids' => $process_pids, - } ); - } - - INFO "Waiting for workers to start..."; - sleep( 5 ); - INFO "Done waiting"; -} - -sub stop_workers : Test(shutdown) -{ - my $self = shift; - - INFO "Killing workers"; - - for my $worker (@{ $self->{ WORKERS }}) { - for my $pid (@{ $worker->{ process_pids }}) { - INFO "Killing worker with PID $pid"; - kill -9, getpgrp( $pid ); - } - } -} - -1; diff --git a/apps/common/tests/perl/MediaWords/KeyValueStore/AmazonS3.t b/apps/common/tests/perl/MediaWords/KeyValueStore/AmazonS3.t deleted file mode 100644 index 5ab668d72e..0000000000 --- a/apps/common/tests/perl/MediaWords/KeyValueStore/AmazonS3.t +++ /dev/null @@ -1,10 +0,0 @@ -use strict; -use warnings; - -use FindBin; -use MediaWords::KeyValueStore::AmazonS3; - -require "$FindBin::Bin/helpers/amazon_s3_tests.inc.pl"; - -my $s3_handler_class = 'MediaWords::KeyValueStore::AmazonS3'; -test_amazon_s3( $s3_handler_class ); diff --git a/apps/common/tests/perl/MediaWords/KeyValueStore/CachedAmazonS3.t b/apps/common/tests/perl/MediaWords/KeyValueStore/CachedAmazonS3.t deleted file mode 100644 index 053ceebaef..0000000000 --- a/apps/common/tests/perl/MediaWords/KeyValueStore/CachedAmazonS3.t +++ /dev/null @@ -1,12 +0,0 @@ -use strict; -use warnings; - -use FindBin; -use MediaWords::KeyValueStore::CachedAmazonS3; -use Readonly; - -require "$FindBin::Bin/helpers/amazon_s3_tests.inc.pl"; - -my $s3_handler_class = 'MediaWords::KeyValueStore::CachedAmazonS3'; -Readonly my $create_mock_download => 1; -test_amazon_s3( $s3_handler_class, $create_mock_download ); diff --git a/apps/common/tests/perl/MediaWords/KeyValueStore/MultipleStores.t b/apps/common/tests/perl/MediaWords/KeyValueStore/MultipleStores.t deleted file mode 100644 index 2fdd1bb207..0000000000 --- a/apps/common/tests/perl/MediaWords/KeyValueStore/MultipleStores.t +++ /dev/null @@ -1,42 +0,0 @@ -use strict; -use warnings; -use utf8; - -use Test::More; -use FindBin; - -use MediaWords::DB; -use MediaWords::KeyValueStore::AmazonS3; -use MediaWords::KeyValueStore::PostgreSQL; -use MediaWords::KeyValueStore::MultipleStores; -use MediaWords::Util::Config::Common; - -sub main() -{ - my $db = MediaWords::DB::connect_to_db(); - - my $postgresql = MediaWords::KeyValueStore::PostgreSQL->new( { table => 'raw_downloads' } ); - my $s3 = s3_download_handler( 'MediaWords::KeyValueStore::AmazonS3' ); - my $multiple_stores = MediaWords::KeyValueStore::MultipleStores->new( - { - stores_for_reading => [ $postgresql, $s3 ], - stores_for_writing => [ $postgresql, $s3 ], - } - ); - - test_postgresql( $db, $multiple_stores ); -} - -my $amazon_s3_downloads_config = MediaWords::Util::Config::Common::amazon_s3_downloads(); -unless ( defined( $amazon_s3_downloads_config->access_key_id() ) ) -{ - plan skip_all => 'Amazon S3\'s testing bucket is not configured'; -} -else -{ - require "$FindBin::Bin/helpers/amazon_s3_tests.inc.pl"; - require "$FindBin::Bin/helpers/postgresql_tests.inc.pl"; - - main(); -} - diff --git a/apps/common/tests/perl/MediaWords/KeyValueStore/PostgreSQL.t b/apps/common/tests/perl/MediaWords/KeyValueStore/PostgreSQL.t deleted file mode 100644 index 52fab94440..0000000000 --- a/apps/common/tests/perl/MediaWords/KeyValueStore/PostgreSQL.t +++ /dev/null @@ -1,16 +0,0 @@ -use strict; -use warnings; -use utf8; - -use FindBin; - -use MediaWords::DB; -use MediaWords::KeyValueStore::PostgreSQL; - -require "$FindBin::Bin/helpers/postgresql_tests.inc.pl"; - -my $db = MediaWords::DB::connect_to_db(); - -my $postgresql = MediaWords::KeyValueStore::PostgreSQL->new( { table => 'raw_downloads' } ); - -test_postgresql( $db, $postgresql ); diff --git a/apps/common/tests/perl/MediaWords/KeyValueStore/helpers/amazon_s3_tests.inc.pl b/apps/common/tests/perl/MediaWords/KeyValueStore/helpers/amazon_s3_tests.inc.pl deleted file mode 100644 index dc5bb00758..0000000000 --- a/apps/common/tests/perl/MediaWords/KeyValueStore/helpers/amazon_s3_tests.inc.pl +++ /dev/null @@ -1,137 +0,0 @@ -use strict; -use warnings; - -use Data::Dumper; -use Test::More; - -use MediaWords::DB; -use MediaWords::Util::Config::Common; -use MediaWords::Util::Text; - -sub s3_download_handler($) -{ - my $s3_handler_class = shift; - - my $s3_config = MediaWords::Util::Config::Common::amazon_s3_downloads(); - - # We want to be able to run S3 tests in parallel - my $test_suffix = '-' . MediaWords::Util::Text::random_string( 64 ); - my $directory_name = $s3_config->directory_name() . $test_suffix; - my $cache_table = 'cache.s3_raw_downloads_cache'; - - return $s3_handler_class->new( - { - access_key_id => $s3_config->access_key_id(), - secret_access_key => $s3_config->secret_access_key(), - bucket_name => $s3_config->bucket_name(), - directory_name => $directory_name, - - # Used only for CachedAmazonS3 - cache_table => $cache_table, - } - ); -} - -sub test_amazon_s3($;$) -{ - my ( $s3_handler_class, $create_mock_download ) = @_; - - my $s3_config = MediaWords::Util::Config::Common::amazon_s3_downloads(); - unless ( $s3_config->access_key_id() ) - { - plan skip_all => 'Amazon S3 is not configured'; - } - else - { - plan tests => 19; - } - - my $db = MediaWords::DB::connect_to_db(); - - my $test_downloads_id = 12345; - if ( $create_mock_download ) - { - require "$FindBin::Bin/helpers/create_mock_download.inc.pl"; - $test_downloads_id = create_mock_download( $db ); - } - - my $s3 = s3_download_handler( $s3_handler_class ); - ok( $s3, "Amazon S3 initialized" ); - - my $test_downloads_path = undef; - my $test_content = 'Loren ipsum dolor sit amet.'; - my $content; - - # - # Store content - # - - my $s3_path; - eval { $s3_path = $s3->store_content( $db, $test_downloads_id, $test_content ); }; - ok( ( !$@ ), "Storing content failed: $@" ); - ok( $s3_path, 'Object ID was returned' ); - like( $s3_path, qr#^s3:.+?/\Q$test_downloads_id\E$#, 'Object ID matches' ); - - # - # Fetch content, compare - # - - eval { $content = $s3->fetch_content( $db, $test_downloads_id, $test_downloads_path ); }; - ok( ( !$@ ), "Fetching download failed: $@" ); - ok( defined $content, "Fetching download did not die but no content was returned" ); - is( $content, $test_content, "Content doesn't match." ); - - # - # Remove content, try fetching again - # - - $s3->remove_content( $db, $test_downloads_id, $test_downloads_path ); - $content = undef; - eval { $content = $s3->fetch_content( $db, $test_downloads_id, $test_downloads_path ); }; - ok( $@, "Fetching download that does not exist should have failed" ); - ok( ( !defined $content ), - "Fetching download that does not exist failed (as expected) but the content was still returned" ); - - # - # Check if Amazon S3 thinks that the content exists - # - ok( - ( !$s3->content_exists( $db, $test_downloads_id, $test_downloads_path ) ), - "content_exists() reports that content exists (although it shouldn't)" - ); - - # - # Store content twice - # - - $s3_path = undef; - eval { - $s3_path = $s3->store_content( $db, $test_downloads_id, $test_content ); - $s3_path = $s3->store_content( $db, $test_downloads_id, $test_content ); - }; - ok( ( !$@ ), "Storing content twice failed: $@" ); - ok( $s3_path, 'Object ID was returned' ); - like( $s3_path, qr#^s3:.+?/\Q$test_downloads_id\E$#, 'Object ID matches' ); - - # Fetch content again, compare - eval { $content = $s3->fetch_content( $db, $test_downloads_id, $test_downloads_path ); }; - ok( ( !$@ ), "Fetching download failed: $@" ); - ok( defined $content, "Fetching download did not die but no content was returned" ); - is( $content, $test_content, "Content doesn't match." ); - - # Remove content, try fetching again - $s3->remove_content( $db, $test_downloads_id, $test_downloads_path ); - $content = undef; - eval { $content = $s3->fetch_content( $db, $test_downloads_id, $test_downloads_path ); }; - ok( $@, "Fetching download that does not exist should have failed" ); - ok( ( !defined $content ), - "Fetching download that does not exist failed (as expected) but the content was still returned" ); - - # Check if Amazon S3 thinks that the content exists - ok( - ( !$s3->content_exists( $db, $test_downloads_id, $test_downloads_path ) ), - "content_exists() reports that content exists (although it shouldn't)" - ); -} - -1; diff --git a/apps/common/tests/perl/MediaWords/KeyValueStore/helpers/create_mock_download.inc.pl b/apps/common/tests/perl/MediaWords/KeyValueStore/helpers/create_mock_download.inc.pl deleted file mode 100644 index 55df30b43d..0000000000 --- a/apps/common/tests/perl/MediaWords/KeyValueStore/helpers/create_mock_download.inc.pl +++ /dev/null @@ -1,45 +0,0 @@ -use strict; -use warnings; -use utf8; - -use Readonly; - -sub create_mock_download($$) -{ - my ( $db ) = @_; - - Readonly my $MOCK_DOWNLOADS_ID => 12345; - - $db->query( - <query( - <query( - <query( - < 20 + 1; - -use MediaWords::KeyValueStore::PostgreSQL; -use Data::Dumper; -use Readonly; - -require "$FindBin::Bin/helpers/create_mock_download.inc.pl"; - -BEGIN -{ - use_ok( 'MediaWords::DB' ); -} - -sub test_store_content($$$) -{ - my ( $db, $postgresql, $test_downloads_id ) = @_; - - my $test_downloads_path = undef; - my $test_content = 'Media Cloud - pnoןɔ ɐıpǝɯ'; # UTF-8 - my $content; - - # Store content - my $postgresql_id; - eval { $postgresql_id = $postgresql->store_content( $db, $test_downloads_id, $test_content ); }; - ok( ( !$@ ), "Storing content failed: $@" ); - ok( $postgresql_id, 'Object ID was returned' ); - ok( length( $postgresql_id ) > length( 'postgresql:' ), 'Object ID is of the valid size' ); - - # Fetch content - eval { $content = $postgresql->fetch_content( $db, $test_downloads_id, $test_downloads_path ); }; - ok( ( !$@ ), "Fetching download failed: $@" ); - ok( defined $content, "Fetching download did not die but no content was returned" ); - is( $content, $test_content, "Content doesn't match." ); - - # Check if PostgreSQL thinks that the content exists - ok( - $postgresql->content_exists( $db, $test_downloads_id, $test_downloads_path ), - "content_exists() reports that content doesn't exist (although it does)" - ); - - # Remove content, try fetching again - $postgresql->remove_content( $db, $test_downloads_id, $test_downloads_path ); - $content = undef; - eval { $content = $postgresql->fetch_content( $db, $test_downloads_id, $test_downloads_path ); }; - ok( $@, "Fetching download that does not exist should have failed" ); - ok( ( !defined $content ), - "Fetching download that does not exist failed (as expected) but the content was still returned" ); - - # Check if PostgreSQL thinks that the content exists - ok( - ( !$postgresql->content_exists( $db, $test_downloads_id, $test_downloads_path ) ), - "content_exists() reports that content exists (although it doesn't)" - ); -} - -sub test_store_content_twice($$$) -{ - my ( $db, $postgresql, $test_downloads_id ) = @_; - - my $test_downloads_path = undef; - my $test_content = 'Loren ipsum dolor sit amet.'; - my $content; - - # Store content - my $postgresql_id; - eval { - $postgresql_id = $postgresql->store_content( $db, $test_downloads_id, $test_content ); - $postgresql_id = $postgresql->store_content( $db, $test_downloads_id, $test_content ); - }; - ok( ( !$@ ), "Storing content failed: $@" ); - ok( $postgresql_id, 'Object ID was returned' ); - ok( length( $postgresql_id ) > length( 'postgresql:' ), 'Object ID is of the valid size' ); - - # Fetch content - eval { $content = $postgresql->fetch_content( $db, $test_downloads_id, $test_downloads_path ); }; - ok( ( !$@ ), "Fetching download failed: $@" ); - ok( defined $content, "Fetching download did not die but no content was returned" ); - is( $content, $test_content, "Content doesn't match." ); - - # Check if PostgreSQL thinks that the content exists - ok( - $postgresql->content_exists( $db, $test_downloads_id, $test_downloads_path ), - "content_exists() reports that content doesn't exist (although it does)" - ); - - # Remove content, try fetching again - $postgresql->remove_content( $db, $test_downloads_id, $test_downloads_path ); - $content = undef; - eval { $content = $postgresql->fetch_content( $db, $test_downloads_id, $test_downloads_path ); }; - ok( $@, "Fetching download that does not exist should have failed" ); - ok( ( !defined $content ), - "Fetching download that does not exist failed (as expected) but the content was still returned" ); - - # Check if PostgreSQL thinks that the content exists - ok( - ( !$postgresql->content_exists( $db, $test_downloads_id, $test_downloads_path ) ), - "content_exists() reports that content exists (although it doesn't)" - ); -} - -sub test_postgresql($$) -{ - my ( $db, $postgresql_handler ) = @_; - - # Errors might want to print out UTF-8 characters - binmode( STDERR, ':utf8' ); - binmode( STDOUT, ':utf8' ); - - my $builder = Test::More->builder; - binmode $builder->output, ":utf8"; - binmode $builder->failure_output, ":utf8"; - binmode $builder->todo_output, ":utf8"; - - my $test_downloads_id = create_mock_download( $db ); - - test_store_content( $db, $postgresql_handler, $test_downloads_id ); - test_store_content_twice( $db, $postgresql_handler, $test_downloads_id ); -} diff --git a/apps/common/tests/perl/MediaWords/Languages/Language.t b/apps/common/tests/perl/MediaWords/Languages/Language.t deleted file mode 100644 index 9919a8ae8d..0000000000 --- a/apps/common/tests/perl/MediaWords/Languages/Language.t +++ /dev/null @@ -1,59 +0,0 @@ -use strict; -use warnings; - -use Test::NoWarnings; -use Test::More tests => 13; - -use MediaWords::Languages::Language; - -use Data::Dumper; - -sub test_language_is_enabled() -{ - ok( MediaWords::Languages::Language::language_is_enabled( 'en' ) ); - ok( MediaWords::Languages::Language::language_is_enabled( 'lt' ) ); - - ok( !MediaWords::Languages::Language::language_is_enabled( undef ) ); - ok( !MediaWords::Languages::Language::language_is_enabled( '' ) ); - ok( !MediaWords::Languages::Language::language_is_enabled( 'xx' ) ); -} - -sub test_language_for_code() -{ - my $en = MediaWords::Languages::Language::language_for_code( 'en' ); - is( $en->language_code(), 'en' ); - - my $lt = MediaWords::Languages::Language::language_for_code( 'lt' ); - is( $lt->language_code(), 'lt' ); - - is( MediaWords::Languages::Language::language_for_code( undef ), undef ); - is( MediaWords::Languages::Language::language_for_code( '' ), undef ); - is( MediaWords::Languages::Language::language_for_code( 'xx' ), undef ); -} - -sub test_default_language_code() -{ - is( MediaWords::Languages::Language::default_language_code(), 'en' ); -} - -sub test_default_language() -{ - my $default_lang = MediaWords::Languages::Language::default_language(); - is( $default_lang->language_code(), 'en' ); -} - -sub main() -{ - # Test::More UTF-8 output - my $builder = Test::More->builder; - binmode $builder->output, ":utf8"; - binmode $builder->failure_output, ":utf8"; - binmode $builder->todo_output, ":utf8"; - - test_language_is_enabled(); - test_language_for_code(); - test_default_language_code(); - test_default_language(); -} - -main(); diff --git a/apps/common/tests/perl/MediaWords/Languages/ca.t b/apps/common/tests/perl/MediaWords/Languages/ca.t deleted file mode 100644 index 970e2d5fd0..0000000000 --- a/apps/common/tests/perl/MediaWords/Languages/ca.t +++ /dev/null @@ -1,71 +0,0 @@ -# -# Some test strings copied from Wikipedia (CC-BY-SA, http://creativecommons.org/licenses/by-sa/3.0/). -# - -use strict; -use warnings; - -use Readonly; - -use Test::NoWarnings; -use Test::More tests => 4; -use utf8; - -use MediaWords::Languages::ca; -use Data::Dumper; - -sub test_split_text_to_sentences() -{ - my $lang = MediaWords::Languages::ca->new(); - - my $test_string = <<'QUOTE'; -El Palau de la Música Catalana és un auditori de música situat al barri de Sant -Pere (Sant Pere, Santa Caterina i la Ribera) de Barcelona. Va ser projectat per -l'arquitecte barceloní Lluís Domènech i Montaner, un dels màxims representants -del modernisme català. -QUOTE - - my $expected_sentences = [ -"El Palau de la Música Catalana és un auditori de música situat al barri de Sant Pere (Sant Pere, Santa Caterina i la Ribera) de Barcelona.", -"Va ser projectat per l'arquitecte barceloní Lluís Domènech i Montaner, un dels màxims representants del modernisme català.", - ]; - - is( join( '||', @{ $lang->split_text_to_sentences( $test_string ) } ), join( '||', @{ $expected_sentences } ) ); -} - -sub test_tokenize() -{ - my $lang = MediaWords::Languages::ca->new(); - - my $input_string = -"Després del Brexit, es confirma el trasllat de l'Agència Europea de Medicaments i l'Autoritat Bancària Europea a Amsterdam i París, respectivament."; - my $expected_words = [ - "després", "del", "brexit", "es", "confirma", "el", - "trasllat", "de", "l'agència", "europea", "de", "medicaments", - "i", "l'autoritat", "bancària", "europea", "a", "amsterdam", - "i", "parís", "respectivament" - ]; - is_deeply( $lang->split_sentence_to_words( $input_string ), $expected_words ); -} - -sub test_stem() -{ - my $lang = MediaWords::Languages::ca->new(); - - is_deeply( $lang->stem_words( [ qw/El Palau de la Música Catalana/ ] ), [ qw/ el pal de la music catal / ] ); -} - -sub main() -{ - # Test::More UTF-8 output - my $builder = Test::More->builder; - binmode $builder->output, ":utf8"; - binmode $builder->failure_output, ":utf8"; - binmode $builder->todo_output, ":utf8"; - - test_split_text_to_sentences(); - test_tokenize(); - test_stem(); -} - -main(); diff --git a/apps/common/tests/perl/MediaWords/Languages/da.t b/apps/common/tests/perl/MediaWords/Languages/da.t deleted file mode 100644 index 8035c28c33..0000000000 --- a/apps/common/tests/perl/MediaWords/Languages/da.t +++ /dev/null @@ -1,90 +0,0 @@ -# -# Some test strings copied from Wikipedia (CC-BY-SA, http://creativecommons.org/licenses/by-sa/3.0/). -# - -use strict; -use warnings; - -use Readonly; - -use Test::NoWarnings; -use Test::More tests => 2 + 1; -use utf8; - -use MediaWords::Languages::da; -use Data::Dumper; - -sub test_split_text_to_sentences() -{ - my $test_string; - my $expected_sentences; - - my $lang = MediaWords::Languages::da->new(); - - # - # Simple paragraph - # - $test_string = <<'QUOTE'; -Sør-Georgia (engelsk: South Georgia) er ei øy i Søratlanteren som høyrer til det britiske oversjøiske territoriet -Sør-Georgia og Sør-Sandwichøyane. Argentina gjer krav på Sør-Georgia og resten av dei britiske territoria i -Søratlanteren. Sør-Georgia har eit areal på 3 756 km² og er 170 km lang og 30 km brei. Det høgste punktet på -øya er Mount Paget på 2 934 moh. I alt elleve fjelltoppar er høgare enn 2 000 moh. 75 % av øya er dekt av snø -og is. Det er meir enn 150 isbrear på øya, og Nordenskiöldbreen er den største. Øya har ingen fastbuande, men har -forskingspersonell som er tilknytte museumsdrifta og forskingsstasjonane på Birdøya og King Edward Point. -QUOTE - - $expected_sentences = [ -'Sør-Georgia (engelsk: South Georgia) er ei øy i Søratlanteren som høyrer til det britiske oversjøiske territoriet Sør-Georgia og Sør-Sandwichøyane.', - 'Argentina gjer krav på Sør-Georgia og resten av dei britiske territoria i Søratlanteren.', - 'Sør-Georgia har eit areal på 3 756 km² og er 170 km lang og 30 km brei.', - 'Det høgste punktet på øya er Mount Paget på 2 934 moh.', - 'I alt elleve fjelltoppar er høgare enn 2 000 moh.', - '75 % av øya er dekt av snø og is.', - 'Det er meir enn 150 isbrear på øya, og Nordenskiöldbreen er den største.', -'Øya har ingen fastbuande, men har forskingspersonell som er tilknytte museumsdrifta og forskingsstasjonane på Birdøya og King Edward Point.' - ]; - - { - is( - join( '||', @{ $lang->split_text_to_sentences( $test_string ) } ), - join( '||', @{ $expected_sentences } ), - "sentence_split" - ); - } - - # - # Date ("14. januar 1776") - # - $test_string = <<'QUOTE'; -Sør-Georgia vart oppdaga av Antoine de la Roché i april 1675, fartøyet hans var kome ut av kurs på ein segltur -frå Lima i Peru til England. Øya vart på ny sett av spanjolen Gregorio Jerez i 1756. James Cook kom til -Sør-Georgia 14. januar 1776 og var den fyrste som gjekk i land på øya. -QUOTE - - $expected_sentences = [ -'Sør-Georgia vart oppdaga av Antoine de la Roché i april 1675, fartøyet hans var kome ut av kurs på ein segltur frå Lima i Peru til England.', - 'Øya vart på ny sett av spanjolen Gregorio Jerez i 1756.', - 'James Cook kom til Sør-Georgia 14. januar 1776 og var den fyrste som gjekk i land på øya.' - ]; - - { - is( - join( '||', @{ $lang->split_text_to_sentences( $test_string ) } ), - join( '||', @{ $expected_sentences } ), - "sentence_split" - ); - } -} - -sub main() -{ - # Test::More UTF-8 output - my $builder = Test::More->builder; - binmode $builder->output, ":utf8"; - binmode $builder->failure_output, ":utf8"; - binmode $builder->todo_output, ":utf8"; - - test_split_text_to_sentences(); -} - -main(); diff --git a/apps/common/tests/perl/MediaWords/Languages/de.t b/apps/common/tests/perl/MediaWords/Languages/de.t deleted file mode 100644 index cf8d8d52b1..0000000000 --- a/apps/common/tests/perl/MediaWords/Languages/de.t +++ /dev/null @@ -1,65 +0,0 @@ -# -# Some test strings copied from Wikipedia (CC-BY-SA, http://creativecommons.org/licenses/by-sa/3.0/). -# - -use strict; -use warnings; - -use Readonly; - -use Test::NoWarnings; -use Test::More tests => 1 + 1; -use utf8; - -use MediaWords::Languages::de; -use Data::Dumper; - -sub test_split_text_to_sentences() -{ - my $test_string; - my $expected_sentences; - - my $lang = MediaWords::Languages::de->new(); - - # - # Simple paragraph + period in the middle of the date + period in the middle of the number - # - $test_string = <<'QUOTE'; -Das Black Album (deutsch: Schwarzes Album) ist das sechzehnte Studioalbum des US-amerikanischen Musikers -Prince. Es erschien am 22. November 1994 bei dem Label Warner Bros. Records. Prince hatte das Album -bereits während der Jahre 1986 und 1987 aufgenommen und Warner Bros. Records wollte es ursprünglich -am 8. Dezember 1987 veröffentlichen. Allerdings zog Prince das Album eine Woche vor dem geplanten -Veröffentlichungstermin ohne Angabe von Gründen zurück. Anschließend entwickelte es sich mit über -250.000 Exemplaren zu einem der meistverkauften Bootlegs der Musikgeschichte, bis es sieben Jahre später -offiziell veröffentlicht wurde. -QUOTE - - $expected_sentences = [ - 'Das Black Album (deutsch: Schwarzes Album) ist das sechzehnte Studioalbum des US-amerikanischen Musikers Prince.', - 'Es erschien am 22. November 1994 bei dem Label Warner Bros. Records.', -'Prince hatte das Album bereits während der Jahre 1986 und 1987 aufgenommen und Warner Bros. Records wollte es ursprünglich am 8. Dezember 1987 veröffentlichen.', -'Allerdings zog Prince das Album eine Woche vor dem geplanten Veröffentlichungstermin ohne Angabe von Gründen zurück.', -'Anschließend entwickelte es sich mit über 250.000 Exemplaren zu einem der meistverkauften Bootlegs der Musikgeschichte, bis es sieben Jahre später offiziell veröffentlicht wurde.' - ]; - - { - is( - join( '||', @{ $lang->split_text_to_sentences( $test_string ) } ), - join( '||', @{ $expected_sentences } ), - "sentence_split" - ); - } -} - -sub main() -{ - # Test::More UTF-8 output - my $builder = Test::More->builder; - binmode $builder->output, ":utf8"; - binmode $builder->failure_output, ":utf8"; - binmode $builder->todo_output, ":utf8"; - - test_split_text_to_sentences(); -} - -main(); diff --git a/apps/common/tests/perl/MediaWords/Languages/en.t b/apps/common/tests/perl/MediaWords/Languages/en.t deleted file mode 100644 index 3e7788ad6b..0000000000 --- a/apps/common/tests/perl/MediaWords/Languages/en.t +++ /dev/null @@ -1,323 +0,0 @@ -use strict; -use warnings; - -use Readonly; - -use Test::NoWarnings; -use Test::More tests => 26; -use utf8; - -use MediaWords::Languages::en; -use Data::Dumper; - -sub test_stopwords() -{ - my $lang = MediaWords::Languages::en->new(); - - ok( $lang->stop_words_map() ); - - # Stop words - my $stop_words_en = $lang->stop_words_map(); - ok( scalar( keys( %{ $stop_words_en } ) ) >= 174, "stop words (en) count is correct" ); - - is( $stop_words_en->{ 'the' }, 1, "English test #1" ); - is( $stop_words_en->{ 'a' }, 1, "English test #2" ); - is( $stop_words_en->{ 'is' }, 1, "English test #3" ); -} - -sub test_split_text_to_sentences() -{ - my $test_string; - my $expected_sentences; - - my $lang = MediaWords::Languages::en->new(); - - # - # Period in number - # - $test_string = <<'QUOTE'; - Sentence contain version 2.0 of the text. Foo. -QUOTE - - $expected_sentences = [ 'Sentence contain version 2.0 of the text.', 'Foo.' ]; - - { - is( - join( '||', @{ $lang->split_text_to_sentences( $test_string ) } ), - join( '||', @{ $expected_sentences } ), - "sentence_split" - ); - } - - # - # 'May' ending - # - $test_string = <<'QUOTE'; - Sentence ends in May. This is the next sentence. Foo. -QUOTE - - $expected_sentences = [ 'Sentence ends in May.', 'This is the next sentence.', 'Foo.' ]; - - { - is( - join( '||', @{ $lang->split_text_to_sentences( $test_string ) } ), - join( '||', @{ $expected_sentences } ), - "sentence_split" - ); - } - - # - # Punctuation - # - $test_string = <<'QUOTE'; - Leave the city! [Mega No!], l. -QUOTE - - $expected_sentences = [ 'Leave the city!', '[Mega No!], l.' ]; - - { - is( - join( '||', @{ $lang->split_text_to_sentences( $test_string ) } ), - join( '||', @{ $expected_sentences } ), - "sentence_split" - ); - } - - # - # Basic Unicode - # - $test_string = <<'QUOTE'; - Non Mega Não. -QUOTE - - $expected_sentences = [ 'Non Mega Não.' ]; - - { - is( - join( '||', @{ $lang->split_text_to_sentences( $test_string ) } ), - join( '||', @{ $expected_sentences } ), - "sentence_split" - ); - } - - # - # Unicode - # - $test_string = <<'QUOTE'; - Non Mega Não! [Mega No!], l. -QUOTE - - $expected_sentences = [ 'Non Mega Não!', '[Mega No!], l.', ]; - - { - is( - join( '||', @{ $lang->split_text_to_sentences( $test_string ) } ), - join( '||', @{ $expected_sentences } ), - "sentence_split" - ); - } - - # - # Quotation - # - $test_string = -"Perhaps that\x{2019}s the best thing the Nobel Committee did by awarding this year\x{2019}s literature prize to a non-dissident, someone whom Peter Englund of the Swedish Academy said was \x{201c}more a critic of the system, sitting within the system.\x{201d} They\x{2019}ve given him a chance to bust out."; - - $expected_sentences = [ -'Perhaps that’s the best thing the Nobel Committee did by awarding this year’s literature prize to a non-dissident, someone whom Peter Englund of the Swedish Academy said was “more a critic of the system, sitting within the system.”', - 'They’ve given him a chance to bust out.', - ]; - - { - is( - join( '||', @{ $lang->split_text_to_sentences( $test_string ) } ), - join( '||', @{ $expected_sentences } ), - "sentence_split" - ); - } - - # - # String whitespace trimming - # - $test_string = <<'QUOTE'; - In another demonstration of cyberactivism and acvistim, Brazilian Internet users are gathering around a cause: to fight Senator Azeredo's Digital Crimes Bill. This legal project, which intends to intervene severely in the way people use the Internet in Brazil is being heavily criticized by Brazil's academic field, left-wing parties and the Internet community. -QUOTE - - $expected_sentences = [ -'In another demonstration of cyberactivism and acvistim, Brazilian Internet users are gathering around a cause: to fight Senator Azeredo\'s Digital Crimes Bill.', -'This legal project, which intends to intervene severely in the way people use the Internet in Brazil is being heavily criticized by Brazil\'s academic field, left-wing parties and the Internet community.', - ]; - - { - is( - join( '||', @{ $lang->split_text_to_sentences( $test_string ) } ), - join( '||', @{ $expected_sentences } ), - "sentence_split" - ); - } - - # - # Two spaces in the middle of the sentence - # - $test_string = <<'QUOTE'; - Although several opposition groups have called for boycotting the coming June 12 presidential election, it seems the weight of boycotting groups is much less than four years ago. -QUOTE - - $expected_sentences = [ -'Although several opposition groups have called for boycotting the coming June 12 presidential election, it seems the weight of boycotting groups is much less than four years ago.', - ]; - - { - is( - join( '||', @{ $lang->split_text_to_sentences( $test_string ) } ), - join( '||', @{ $expected_sentences } ), - "sentence_split" - ); - } - - # - # Non-breaking space - # - $test_string = <<"QUOTE"; - American Current TV journalists Laura Ling and Euna Lee have been sentenced to 12 years of hard labor (according to CNN).  Jillian York rounded up blog posts for Global Voices prior to the journalists' sentencing. -QUOTE - - $expected_sentences = [ -'American Current TV journalists Laura Ling and Euna Lee have been sentenced to 12 years of hard labor (according to CNN).', - 'Jillian York rounded up blog posts for Global Voices prior to the journalists\' sentencing.', - ]; - - { - is( - join( '||', @{ $lang->split_text_to_sentences( $test_string ) } ), - join( '||', @{ $expected_sentences } ), - "sentence_split" - ); - } - - # - # No space after a period - # - $test_string = <<'QUOTE'; - Anger is a waste of energy and what North Korea wants of you.We can and will work together and use our minds, to work this through. -QUOTE - - $expected_sentences = [ - 'Anger is a waste of energy and what North Korea wants of you.', - 'We can and will work together and use our minds, to work this through.' - ]; - - { - is( - join( '||', @{ $lang->split_text_to_sentences( $test_string ) } ), - join( '||', @{ $expected_sentences } ), - "sentence_split" - ); - } - - # - # Unicode's "…" - # - $test_string = <<'QUOTE'; - One of the most popular Brahmin community, with 28, 726 members, randomly claims: “we r clever & hardworking. no one can fool us…” The Brahmans community with 41952 members and the Brahmins of India community with 30588 members are also very popular. -QUOTE - - $expected_sentences = [ -'One of the most popular Brahmin community, with 28, 726 members, randomly claims: “we r clever & hardworking. no one can fool us...”', -'The Brahmans community with 41952 members and the Brahmins of India community with 30588 members are also very popular.', - ]; - - { - is( - join( '||', @{ $lang->split_text_to_sentences( $test_string ) } ), - join( '||', @{ $expected_sentences } ), - "sentence_split" - ); - } -} - -sub test_tokenize() -{ - my ( $input_string, $expected_words ); - - my $lang = MediaWords::Languages::en->new(); - - # Normal apostrophe (') - $input_string = "It's always sunny in Philadelphia."; - $expected_words = [ "it's", "always", "sunny", "in", "philadelphia" ]; - is_deeply( $lang->split_sentence_to_words( $input_string ), $expected_words, 'Tokenization with normal apostrophe' ); - - # Right single quotation mark (’), normalized to apostrophe (') - $input_string = "It’s always sunny in Philadelphia."; - $expected_words = [ "it's", "always", "sunny", "in", "philadelphia" ]; - is_deeply( $lang->split_sentence_to_words( $input_string ), $expected_words, 'Tokenization with fancy apostrophe' ); - - # Hyphen without split - $input_string = "near-total secrecy"; - $expected_words = [ "near-total", "secrecy" ]; - is_deeply( $lang->split_sentence_to_words( $input_string ), $expected_words, 'Tokenization with hyphen (no split)' ); - - # Hyphen with split (where it's being used as a dash) - $input_string = "A Pythagorean triple - named for the ancient Greek Pythagoras"; - $expected_words = [ 'a', 'pythagorean', 'triple', 'named', 'for', 'the', 'ancient', 'greek', 'pythagoras' ]; - is_deeply( $lang->split_sentence_to_words( $input_string ), $expected_words, 'Tokenization with hyphen (split)' ); - - # Quotes - $input_string = 'it was in the Guinness Book of World Records as the "most difficult mathematical problem"'; - $expected_words = [ - 'it', 'was', 'in', 'the', 'guinness', 'book', 'of', 'world', - 'records', 'as', 'the', 'most', 'difficult', 'mathematical', 'problem' - ]; - is_deeply( $lang->split_sentence_to_words( $input_string ), $expected_words, 'Quote removal while tokenizing' ); -} - -sub test_stem() -{ - my $lang = MediaWords::Languages::en->new(); - - # from http://en.wikipedia.org/wiki/Stemming - my $split_words = [ - 'In', 'linguistic', 'morphology', 'stemming', 'is', 'the', 'process', 'for', - 'reducing', 'inflected', 'or', 'sometimes', 'derived', 'words', 'to', 'their', - 'stem', 'base', 'or', 'root', 'form', 'generally', 'a', 'written', - 'word', 'form', - ]; - - my $expected_stems = [ - 'in', 'linguist', 'morpholog', 'stem', 'is', 'the', 'process', 'for', - 'reduc', 'inflect', 'or', 'sometim', 'deriv', 'word', 'to', 'their', - 'stem', 'base', 'or', 'root', 'form', 'general', 'a', 'written', - 'word', 'form', - ]; - - my $stem_result = $lang->stem_words( $split_words ); - - is_deeply( $stem_result, $expected_stems, "Stemmer compare test" ); - - ok( length( $expected_stems ) > 0, "Stemmed text is nonempty" ); - - # Apostrophes - is_deeply( - $lang->stem_words( [ "Katz's", "Delicatessen" ] ), - [ 'katz', 'delicatessen' ], - 'Stemming with normal apostrophe' - ); - is_deeply( $lang->stem_words( [ "it’s", "toasted" ] ), [ 'it', 'toast' ], - 'Stemming with right single quotation mark' ); -} - -sub main() -{ - # Test::More UTF-8 output - my $builder = Test::More->builder; - binmode $builder->output, ":utf8"; - binmode $builder->failure_output, ":utf8"; - binmode $builder->todo_output, ":utf8"; - - test_stopwords(); - test_split_text_to_sentences(); - test_tokenize(); - test_stem(); -} - -main(); diff --git a/apps/common/tests/perl/MediaWords/Languages/es.t b/apps/common/tests/perl/MediaWords/Languages/es.t deleted file mode 100644 index 1010bd2265..0000000000 --- a/apps/common/tests/perl/MediaWords/Languages/es.t +++ /dev/null @@ -1,86 +0,0 @@ -# -# Some test strings copied from Wikipedia (CC-BY-SA, http://creativecommons.org/licenses/by-sa/3.0/). -# - -use strict; -use warnings; - -use Readonly; - -use Test::NoWarnings; -use Test::More tests => 2 + 1; -use utf8; - -use MediaWords::Languages::es; -use Data::Dumper; - -sub test_split_text_to_sentences() -{ - my $test_string; - my $expected_sentences; - - my $lang = MediaWords::Languages::es->new(); - - # - # Simple paragraph - # - $test_string = <<'QUOTE'; -El paracetamol (DCI) o acetaminofén (acetaminofeno) es un fármaco con propiedades analgésicas, -sin propiedades antiinflamatorias clínicamente significativas. Actúa inhibiendo la síntesis de -prostaglandinas, mediadores celulares responsables de la aparición del dolor. Además, tiene -efectos antipiréticos. Se presenta habitualmente en forma de cápsulas, comprimidos, supositorios -o gotas de administración oral. -QUOTE - - $expected_sentences = [ -'El paracetamol (DCI) o acetaminofén (acetaminofeno) es un fármaco con propiedades analgésicas, sin propiedades antiinflamatorias clínicamente significativas.', - 'Actúa inhibiendo la síntesis de prostaglandinas, mediadores celulares responsables de la aparición del dolor.', - 'Además, tiene efectos antipiréticos.', - 'Se presenta habitualmente en forma de cápsulas, comprimidos, supositorios o gotas de administración oral.' - ]; - - { - is( - join( '||', @{ $lang->split_text_to_sentences( $test_string ) } ), - join( '||', @{ $expected_sentences } ), - "sentence_split" - ); - } - - # - # Period in the middle of the number - # - $test_string = <<'QUOTE'; -Esa misma noche el ministro de Defensa, Ehud Barak, consiguió el apoyo del gabinete israelí para ampliar -la movilización de reservistas de 30.000 a 75.000, de cara a una posible operación terrestre sobre la -Franja de Gaza. El ministro de Relaciones Exteriores Avigdor Lieberman, aclaró que el gobierno actual -no estaba considerando el derrocamiento del gobierno de Hamas en la Franja, y que lo tendría que decidir -el próximo gobierno. -QUOTE - - $expected_sentences = [ -'Esa misma noche el ministro de Defensa, Ehud Barak, consiguió el apoyo del gabinete israelí para ampliar la movilización de reservistas de 30.000 a 75.000, de cara a una posible operación terrestre sobre la Franja de Gaza.', -'El ministro de Relaciones Exteriores Avigdor Lieberman, aclaró que el gobierno actual no estaba considerando el derrocamiento del gobierno de Hamas en la Franja, y que lo tendría que decidir el próximo gobierno.', - ]; - - { - is( - join( '||', @{ $lang->split_text_to_sentences( $test_string ) } ), - join( '||', @{ $expected_sentences } ), - "sentence_split" - ); - } -} - -sub main() -{ - # Test::More UTF-8 output - my $builder = Test::More->builder; - binmode $builder->output, ":utf8"; - binmode $builder->failure_output, ":utf8"; - binmode $builder->todo_output, ":utf8"; - - test_split_text_to_sentences(); -} - -main(); diff --git a/apps/common/tests/perl/MediaWords/Languages/fi.t b/apps/common/tests/perl/MediaWords/Languages/fi.t deleted file mode 100644 index 42407b361a..0000000000 --- a/apps/common/tests/perl/MediaWords/Languages/fi.t +++ /dev/null @@ -1,145 +0,0 @@ -# -# Some test strings copied from Wikipedia (CC-BY-SA, http://creativecommons.org/licenses/by-sa/3.0/). -# - -use strict; -use warnings; - -use Readonly; - -use Test::NoWarnings; -use Test::More tests => 4 + 1; -use utf8; - -use MediaWords::Languages::fi; -use Data::Dumper; - -sub test_split_text_to_sentences() -{ - my $test_string; - my $expected_sentences; - - my $lang = MediaWords::Languages::fi->new(); - - # - # Simple paragraph - # - $test_string = <<'QUOTE'; -Pallokalat (Tetraodontidae) on kalaheimo, johon kuuluu sekä koralliriutoilla, murtovedessä että makeassa -vedessä eläviä lajeja. Vuonna 2004 heimosta tunnettiin 187 lajia, joista jotkut elävät makeassa tai -murtovedessä, jotkut taas viettävät osan elämästään murto- ja osan merivedessä. Pallokalat ovat -saaneet nimensä siitä, että pelästyessään ne imevät itsensä täyteen vettä tai ilmaa ja pullistuvat -palloiksi. Toinen pullistelevien kalojen heimo on siilikalat. Pallokalat ovat terävähampaisia petoja, -jotka syövät muun muassa simpukoita, kotiloita ja muita kaloja. Pallokaloja voidaan pitää akvaariossa, -mutta hoitajan tulee olla perehtynyt niiden hoitoon hyvin. -QUOTE - - $expected_sentences = [ -'Pallokalat (Tetraodontidae) on kalaheimo, johon kuuluu sekä koralliriutoilla, murtovedessä että makeassa vedessä eläviä lajeja.', -'Vuonna 2004 heimosta tunnettiin 187 lajia, joista jotkut elävät makeassa tai murtovedessä, jotkut taas viettävät osan elämästään murto- ja osan merivedessä.', -'Pallokalat ovat saaneet nimensä siitä, että pelästyessään ne imevät itsensä täyteen vettä tai ilmaa ja pullistuvat palloiksi.', - 'Toinen pullistelevien kalojen heimo on siilikalat.', - 'Pallokalat ovat terävähampaisia petoja, jotka syövät muun muassa simpukoita, kotiloita ja muita kaloja.', - 'Pallokaloja voidaan pitää akvaariossa, mutta hoitajan tulee olla perehtynyt niiden hoitoon hyvin.' - ]; - - { - is( - join( '||', @{ $lang->split_text_to_sentences( $test_string ) } ), - join( '||', @{ $expected_sentences } ), - "sentence_split" - ); - } - - # - # Number followed by a period - # - $test_string = <<'QUOTE'; -Katso Teiniäidit-sarjan 8. jakso ennakkoon. -QUOTE - - $expected_sentences = [ 'Katso Teiniäidit-sarjan 8. jakso ennakkoon.', ]; - - { - is( - join( '||', @{ $lang->split_text_to_sentences( $test_string ) } ), - join( '||', @{ $expected_sentences } ), - "sentence_split" - ); - } - - # - # Dates with a period ("31. tammikuuta", "1. helmikuuta") - # - $test_string = <<'QUOTE'; -Toisin kuin monissa muissa palkinnoissa, Nobel-palkinnon saajan valitseminen on pitkä prosessi. -Tämä on nostanut palkinnon arvokkuutta ja sen takia palkintoa pidetään alansa arvostetuimpana. -Nobel-komiteat lähettävät vuosittain tuhansille eri alojen tiedemiehille, eri organisaatioiden -ja akatemioiden jäsenille sekä edellisille Nobel-palkinnon saaneille viestin, jossa heitä -pyydetään asettamaan ehdokas seuraavaksi palkinnon saajaksi. Ehdolleasettajat pyritään -valitsemaan siten, että mahdollisimman monet yliopistot ympäri maailmaa saavat asettaa -ehdokkaita mahdollisimman tasa-arvoisesti. Vuosittain ehdolle asetetaan 200–300 henkilöä -(myös joitakin organisaatioita voidaan ehdottaa) kuhunkin palkintoryhmään. Ehdokkaita ei saa -julkistaa ennen kuin 50 vuotta on kulunut ehdolle asettumisen jälkeen. Aikaraja ehdotusten -lähettämiseen on 31. tammikuuta. Rauhanpalkinnon aikaraja on 1. helmikuuta. -QUOTE - - $expected_sentences = [ - 'Toisin kuin monissa muissa palkinnoissa, Nobel-palkinnon saajan valitseminen on pitkä prosessi.', - 'Tämä on nostanut palkinnon arvokkuutta ja sen takia palkintoa pidetään alansa arvostetuimpana.', -'Nobel-komiteat lähettävät vuosittain tuhansille eri alojen tiedemiehille, eri organisaatioiden ja akatemioiden jäsenille sekä edellisille Nobel-palkinnon saaneille viestin, jossa heitä pyydetään asettamaan ehdokas seuraavaksi palkinnon saajaksi.', -'Ehdolleasettajat pyritään valitsemaan siten, että mahdollisimman monet yliopistot ympäri maailmaa saavat asettaa ehdokkaita mahdollisimman tasa-arvoisesti.', -'Vuosittain ehdolle asetetaan 200–300 henkilöä (myös joitakin organisaatioita voidaan ehdottaa) kuhunkin palkintoryhmään.', - 'Ehdokkaita ei saa julkistaa ennen kuin 50 vuotta on kulunut ehdolle asettumisen jälkeen.', - 'Aikaraja ehdotusten lähettämiseen on 31. tammikuuta.', - 'Rauhanpalkinnon aikaraja on 1. helmikuuta.', - ]; - - { - is( - join( '||', @{ $lang->split_text_to_sentences( $test_string ) } ), - join( '||', @{ $expected_sentences } ), - "sentence_split" - ); - } - - # - # Abbreviations, numbers - # - $test_string = <<'QUOTE'; -Vuotta 0 ei jostakin syystä ole otettu käyttöön juliaanisessa eikä gregoriaanisessa ajanlaskussa, -vaikka normaalisti ajan kulun laskeminen aloitetaan nollasta, kuten kalenterivuorokausi kello 0.00 -ja vasta ensimmäisen tunnin kuluttua on kello 1.00. Myös ihmisen syntymästä, jostakin tapahtumasta -tai sen alusta mennyt aika ilmoitetaan ajanlaskun alusta kuluneina täysinä vuosina: kun ensimmäinen -vuosi (vuosi 0) on mennyt, vasta silloin merkitään 1 tai kun kymmenes vuosi (vuosi 9) on kulunut, -on kymmenen vuotta täynnä ja alkaa vuosi 10 alkuhetkestä laskettuna. Ajanlaskun ensimmäisenä -pidetty vuosi on 1 jKr., ja vasta sen päätyttyä oli Kristuksen syntymästä kulunut 1 vuosi. -QUOTE - - $expected_sentences = [ -'Vuotta 0 ei jostakin syystä ole otettu käyttöön juliaanisessa eikä gregoriaanisessa ajanlaskussa, vaikka normaalisti ajan kulun laskeminen aloitetaan nollasta, kuten kalenterivuorokausi kello 0.00 ja vasta ensimmäisen tunnin kuluttua on kello 1.00.', -'Myös ihmisen syntymästä, jostakin tapahtumasta tai sen alusta mennyt aika ilmoitetaan ajanlaskun alusta kuluneina täysinä vuosina: kun ensimmäinen vuosi (vuosi 0) on mennyt, vasta silloin merkitään 1 tai kun kymmenes vuosi (vuosi 9) on kulunut, on kymmenen vuotta täynnä ja alkaa vuosi 10 alkuhetkestä laskettuna.', -'Ajanlaskun ensimmäisenä pidetty vuosi on 1 jKr., ja vasta sen päätyttyä oli Kristuksen syntymästä kulunut 1 vuosi.' - ]; - - { - is( - join( '||', @{ $lang->split_text_to_sentences( $test_string ) } ), - join( '||', @{ $expected_sentences } ), - "sentence_split" - ); - } -} - -sub main() -{ - # Test::More UTF-8 output - my $builder = Test::More->builder; - binmode $builder->output, ":utf8"; - binmode $builder->failure_output, ":utf8"; - binmode $builder->todo_output, ":utf8"; - - test_split_text_to_sentences(); -} - -main(); diff --git a/apps/common/tests/perl/MediaWords/Languages/fr.t b/apps/common/tests/perl/MediaWords/Languages/fr.t deleted file mode 100644 index 3bbe6d694d..0000000000 --- a/apps/common/tests/perl/MediaWords/Languages/fr.t +++ /dev/null @@ -1,100 +0,0 @@ -# -# Some test strings copied from Wikipedia (CC-BY-SA, http://creativecommons.org/licenses/by-sa/3.0/). -# - -use strict; -use warnings; - -use Readonly; - -use Test::NoWarnings; -use Test::More tests => 2 + 1; -use utf8; - -use MediaWords::Languages::fr; -use Data::Dumper; - -sub test_split_text_to_sentences() -{ - my $test_string; - my $expected_sentences; - - my $lang = MediaWords::Languages::fr->new(); - - # - # Simple paragraph - # - $test_string = <<'QUOTE'; -Jusqu'aux années 2000, l'origine du cheval domestique est étudiée par synapomorphie, en comparant -des fossiles et squelettes. Les progrès de la génétique permettent désormais une autre approche, -le nombre de gènes entre les différentes espèces d'équidés étant variable. La différentiation -entre les espèces d’Equus laisse à penser que cette domestication est récente, et qu'elle concerne -un nombre restreint d'étalons pour un grand nombre de juments, capturées à l'état sauvage afin -de repeupler les élevages domestiques. Peu à peu, l'élevage sélectif entraîne une distinction des -chevaux selon leur usage, la traction ou la selle, et un accroissement de la variété des robes de -leurs robes. -QUOTE - - $expected_sentences = [ -'Jusqu\'aux années 2000, l\'origine du cheval domestique est étudiée par synapomorphie, en comparant des fossiles et squelettes.', -'Les progrès de la génétique permettent désormais une autre approche, le nombre de gènes entre les différentes espèces d\'équidés étant variable.', -'La différentiation entre les espèces d’Equus laisse à penser que cette domestication est récente, et qu\'elle concerne un nombre restreint d\'étalons pour un grand nombre de juments, capturées à l\'état sauvage afin de repeupler les élevages domestiques.', -'Peu à peu, l\'élevage sélectif entraîne une distinction des chevaux selon leur usage, la traction ou la selle, et un accroissement de la variété des robes de leurs robes.' - ]; - - { - is( - join( '||', @{ $lang->split_text_to_sentences( $test_string ) } ), - join( '||', @{ $expected_sentences } ), - "sentence_split" - ); - } - - # - # Non-breakable abbreviation (e.g. "4500 av. J.-C.") - # - $test_string = <<'QUOTE'; -La domestication du cheval est l'ensemble des processus de domestication conduisant l'homme à -maîtriser puis à utiliser l'espèce Equus caballus (le cheval) à son profit grâce au contrôle -des naissances et à l'élevage de ces animaux pour la consommation, la guerre, le travail et -le transport. De nombreuses théories sont proposées, tant en termes d'époque, de nombre de -foyers de domestication, que de types, espèces ou sous-espèces de chevaux domestiqués. Plus -tardive que pour les espèces animales alimentaires, la domestication du cheval est difficile -à dater avec précision. Les premiers apprivoisements pourraient remonter au Paléolithique -supérieur, 8 000 ans avant notre ère. La première preuve archéologique date de 4500 av. J.-C. dans -les steppes au Nord du Kazakhstan, parmi la culture Botaï. D'autres éléments en évoquent -indépendamment dans la péninsule ibérique, et peut-être la péninsule arabique. Les recherches -précédentes se sont longtemps focalisées sur les steppes d'Asie centrale, vers 4000 à 3500 av. J.-C.. -QUOTE - - $expected_sentences = [ -'La domestication du cheval est l\'ensemble des processus de domestication conduisant l\'homme à maîtriser puis à utiliser l\'espèce Equus caballus (le cheval) à son profit grâce au contrôle des naissances et à l\'élevage de ces animaux pour la consommation, la guerre, le travail et le transport.', -'De nombreuses théories sont proposées, tant en termes d\'époque, de nombre de foyers de domestication, que de types, espèces ou sous-espèces de chevaux domestiqués.', -'Plus tardive que pour les espèces animales alimentaires, la domestication du cheval est difficile à dater avec précision.', - 'Les premiers apprivoisements pourraient remonter au Paléolithique supérieur, 8 000 ans avant notre ère.', -'La première preuve archéologique date de 4500 av. J.-C. dans les steppes au Nord du Kazakhstan, parmi la culture Botaï.', -'D\'autres éléments en évoquent indépendamment dans la péninsule ibérique, et peut-être la péninsule arabique.', -'Les recherches précédentes se sont longtemps focalisées sur les steppes d\'Asie centrale, vers 4000 à 3500 av. J.-C..' - ]; - - { - is( - join( '||', @{ $lang->split_text_to_sentences( $test_string ) } ), - join( '||', @{ $expected_sentences } ), - "sentence_split" - ); - } -} - -sub main() -{ - # Test::More UTF-8 output - my $builder = Test::More->builder; - binmode $builder->output, ":utf8"; - binmode $builder->failure_output, ":utf8"; - binmode $builder->todo_output, ":utf8"; - - test_split_text_to_sentences(); -} - -main(); diff --git a/apps/common/tests/perl/MediaWords/Languages/ha.t b/apps/common/tests/perl/MediaWords/Languages/ha.t deleted file mode 100644 index 4e4aa70f39..0000000000 --- a/apps/common/tests/perl/MediaWords/Languages/ha.t +++ /dev/null @@ -1,49 +0,0 @@ -use strict; -use warnings; -use utf8; - -use Test::More tests => 5; -use Test::NoWarnings; - -use MediaWords::Languages::ha; - -use Data::Dumper; -use Readonly; - -sub test_stem($) -{ - my $lang = shift; - - # https://github.com/mediacloud/hausastemmer/blob/develop/tests/ref_stems/with_dict_lookup.py - my $tokens_and_stems = { - - 'ababen' => 'ababe', - 'abin' => 'abin', - 'abincin' => 'abinci', - - # Empty tokens - '' => '', - }; - - for my $token ( keys %{ $tokens_and_stems } ) - { - my $expected_stem = $tokens_and_stems->{ $token }; - my $actual_stem = $lang->stem_words( [ $token ] )->[ 0 ]; - is( $actual_stem, $expected_stem, "stem_words(): $token" ); - } -} - -sub main() -{ - # Test::More UTF-8 output - my $builder = Test::More->builder; - binmode $builder->output, ":utf8"; - binmode $builder->failure_output, ":utf8"; - binmode $builder->todo_output, ":utf8"; - - my $lang = MediaWords::Languages::ha->new(); - - test_stem( $lang ); -} - -main(); diff --git a/apps/common/tests/perl/MediaWords/Languages/hi.t b/apps/common/tests/perl/MediaWords/Languages/hi.t deleted file mode 100644 index 7865ff286a..0000000000 --- a/apps/common/tests/perl/MediaWords/Languages/hi.t +++ /dev/null @@ -1,103 +0,0 @@ -use strict; -use warnings; -use utf8; - -use Test::More tests => 24; -use Test::Differences; -use Test::NoWarnings; - -use MediaWords::Languages::hi; - -use Data::Dumper; -use Readonly; - -sub test_stem($) -{ - my $lang = shift; - -# https://github.com/apache/lucene-solr/blob/master/lucene/analysis/common/src/test/org/apache/lucene/analysis/hi/TestHindiStemmer.java - my $tokens_and_stems = { - - # Masculine noun inflections - 'लडका' => 'लडका', - 'लडके' => 'लडके', - 'लडकों' => 'लडकों', - 'गुरु' => 'गुरु', - 'गुरुओं' => 'गुरु', - 'दोस्त' => 'दोस्त', - 'दोस्तों' => 'दोस', - - # Feminine noun inflections - 'लडकी' => 'लडकी', - 'लडकियों' => 'लडकियों', - 'किताब' => 'किताब', - 'किताबें' => 'किताबे', - 'किताबों' => 'किताबो', - 'आध्यापीका' => 'आध्यापीका', - 'आध्यापीकाएं' => 'आध्यापीकाएं', - 'आध्यापीकाओं' => 'आध्यापीकाओं', - - # Some verb forms - 'खाना' => 'खाना', - 'खाता' => 'खाता', - 'खाती' => 'खाती', - 'खा' => 'खा', - - # Exceptions - 'कठिनाइयां' => 'कठिना', - 'कठिन' => 'कठिन', - - # Empty tokens - '' => '', - }; - - for my $token ( keys %{ $tokens_and_stems } ) - { - my $expected_stem = $tokens_and_stems->{ $token }; - my $actual_stem = $lang->stem_words( [ $token ] )->[ 0 ]; - is( $actual_stem, $expected_stem, "stem_words(): $token" ); - } -} - -sub test_split_text_to_sentences($) -{ - my $lang = shift; - - # - # Simple paragraph - # - my $input_text = <<'QUOTE'; -अंटार्कटिका (या अन्टार्टिका) पृथ्वी का दक्षिणतम महाद्वीप है, जिसमें दक्षिणी -ध्रुव अंतर्निहित है। यह दक्षिणी गोलार्द्ध के अंटार्कटिक क्षेत्र और लगभग पूरी तरह -से अंटार्कटिक वृत के दक्षिण में स्थित है। यह चारों ओर से दक्षिणी महासागर से घिरा -हुआ है। अपने 140 लाख वर्ग किलोमीटर (54 लाख वर्ग मील) क्षेत्रफल के साथ यह, एशिया, -अफ्रीका, उत्तरी अमेरिका और दक्षिणी अमेरिका के बाद, पृथ्वी का पांचवां सबसे बड़ा -महाद्वीप है, अंटार्कटिका का 98% भाग औसतन 1.6 किलोमीटर मोटी बर्फ से आच्छादित है। -QUOTE - - my $expected_sentences = [ -'अंटार्कटिका (या अन्टार्टिका) पृथ्वी का दक्षिणतम महाद्वीप है, जिसमें दक्षिणी ध्रुव अंतर्निहित है।', -'यह दक्षिणी गोलार्द्ध के अंटार्कटिक क्षेत्र और लगभग पूरी तरह से अंटार्कटिक वृत के दक्षिण में स्थित है।', -'यह चारों ओर से दक्षिणी महासागर से घिरा हुआ है।', -'अपने 140 लाख वर्ग किलोमीटर (54 लाख वर्ग मील) क्षेत्रफल के साथ यह, एशिया, अफ्रीका, उत्तरी अमेरिका और दक्षिणी अमेरिका के बाद, पृथ्वी का पांचवां सबसे बड़ा महाद्वीप है, अंटार्कटिका का 98% भाग औसतन 1.6 किलोमीटर मोटी बर्फ से आच्छादित है।', - ]; - my $actual_sentences = $lang->split_text_to_sentences( $input_text ); - - eq_or_diff( $actual_sentences, $expected_sentences ); -} - -sub main() -{ - # Test::More UTF-8 output - my $builder = Test::More->builder; - binmode $builder->output, ":utf8"; - binmode $builder->failure_output, ":utf8"; - binmode $builder->todo_output, ":utf8"; - - my $lang = MediaWords::Languages::hi->new(); - - test_stem( $lang ); - test_split_text_to_sentences( $lang ); -} - -main(); diff --git a/apps/common/tests/perl/MediaWords/Languages/hu.t b/apps/common/tests/perl/MediaWords/Languages/hu.t deleted file mode 100644 index 1ef962bd9d..0000000000 --- a/apps/common/tests/perl/MediaWords/Languages/hu.t +++ /dev/null @@ -1,244 +0,0 @@ -# -# Some test strings copied from Wikipedia (CC-BY-SA, http://creativecommons.org/licenses/by-sa/3.0/). -# - -use strict; -use warnings; - -use Readonly; - -use Test::NoWarnings; -use Test::More tests => 9 + 1; -use utf8; - -use MediaWords::Languages::hu; -use Data::Dumper; - -sub test_split_text_to_sentences() -{ - my $test_string; - my $expected_sentences; - - my $lang = MediaWords::Languages::hu->new(); - - # - # Simple paragraph - # - $test_string = <<'QUOTE'; -Ifjúkoráról keveset tudni, a kor igényeinek megfelelően valószínűleg matematikát és hajózást tanult. -Miután Kolumbusz Kristóf spanyol zászló alatt hajózva felfedezte Amerikát 1492-ben, Portugália joggal -érezhette, hogy lépéshátrányba került nagy riválisával szemben. Öt esztendővel később a lisszaboni -kikötőből kifutott az első olyan flotta, amelyik Indiába akart eljutni azon az útvonalon, amelyet -Bartolomeu Dias megnyitott a portugálok számára. -QUOTE - - $expected_sentences = [ - 'Ifjúkoráról keveset tudni, a kor igényeinek megfelelően valószínűleg matematikát és hajózást tanult.', -'Miután Kolumbusz Kristóf spanyol zászló alatt hajózva felfedezte Amerikát 1492-ben, Portugália joggal érezhette, hogy lépéshátrányba került nagy riválisával szemben.', -'Öt esztendővel később a lisszaboni kikötőből kifutott az első olyan flotta, amelyik Indiába akart eljutni azon az útvonalon, amelyet Bartolomeu Dias megnyitott a portugálok számára.' - ]; - - { - is( - join( '||', @{ $lang->split_text_to_sentences( $test_string ) } ), - join( '||', @{ $expected_sentences } ), - "sentence_split" - ); - } - - # - # Dates, abbreviations ("1845. febr. 8. Edgeworthstown, † 1926. febr. 13. Oxford"), brackets - # - $test_string = <<'QUOTE'; -Edgeworth, Francis Ysidro (1845. febr. 8. Edgeworthstown, † 1926. febr. 13. Oxford): ír közgazdász -és statisztikus, aki a közgazdaságtudományban maradandót alkotott a közömbösségi görbék rendszerének -megalkotásával. Nevéhez fűződik még a szerződési görbe és az úgynevezett Edgeworth-doboz vagy -Edgeworth-négyszög kidolgozása. ( Az utóbbit Pareto-féle box-diagrammnak is nevezik.) Mint -statisztikus, a korrelációszámítást fejlesztette tovább, s az index-számításban a bázis és a -tárgyidőszak fogyasztási szerkezettel számított indexek számtani átlagaként képzett indexet róla -nevezik Edgeworth-indexnek. -QUOTE - - $expected_sentences = [ - 'Edgeworth, Francis Ysidro (1845. febr. 8. Edgeworthstown, † 1926. febr. 13. Oxford): ír közgazdász és statisztikus, aki a közgazdaságtudományban maradandót alkotott a közömbösségi görbék rendszerének megalkotásával.', - 'Nevéhez fűződik még a szerződési görbe és az úgynevezett Edgeworth-doboz vagy Edgeworth-négyszög kidolgozása. ( Az utóbbit Pareto-féle box-diagrammnak is nevezik.)', - 'Mint statisztikus, a korrelációszámítást fejlesztette tovább, s az index-számításban a bázis és a tárgyidőszak fogyasztási szerkezettel számított indexek számtani átlagaként képzett indexet róla nevezik Edgeworth-indexnek.' - ]; - - { - is( - join( '||', @{ $lang->split_text_to_sentences( $test_string ) } ), - join( '||', @{ $expected_sentences } ), - "sentence_split" - ); - } - - # - # Abbreviation ("Dr."), date ("Komárom, 1825. február 18. – Budapest, Erzsébetváros, 1904. május 5.") - # - $test_string = <<'QUOTE'; -Dr. Ásvay Jókai Móric (Komárom, 1825. február 18. – Budapest, Erzsébetváros, 1904. május 5.) -regényíró, a „nagy magyar mesemondó”, országgyűlési képviselő, főrendiházi tag, a Magyar -Tudományos Akadémia igazgató-tanácsának tagja, a Szent István-rend lovagja, a Kisfaludy -Társaság tagja, a Petőfi Társaság elnöke, a Dugonics Társaság tiszteletbeli tagja. -QUOTE - - $expected_sentences = [ -'Dr. Ásvay Jókai Móric (Komárom, 1825. február 18. – Budapest, Erzsébetváros, 1904. május 5.) regényíró, a „nagy magyar mesemondó”, országgyűlési képviselő, főrendiházi tag, a Magyar Tudományos Akadémia igazgató-tanácsának tagja, a Szent István-rend lovagja, a Kisfaludy Társaság tagja, a Petőfi Társaság elnöke, a Dugonics Társaság tiszteletbeli tagja.' - ]; - - { - is( - join( '||', @{ $lang->split_text_to_sentences( $test_string ) } ), - join( '||', @{ $expected_sentences } ), - "sentence_split" - ); - } - - # - # Dates - # - $test_string = <<'QUOTE'; -Hszi Csin-ping (kínaiul: 习近平, pinjin, hangsúlyjelekkel: Xí Jìnpíng) (Fuping, Shaanxi -tartomány, 1953. június 1.) kínai politikus, 2008. március 15. óta a Kínai Népköztársaság -alelnöke, 2012. november 15. óta a KKP KB Politikai Bizottsága Állandó Bizottságának, -az ország de facto legfelső hatalmi grémiumának, valamint a KKP Központi Katonai -Bizottságának az elnöke. A várakozások szerint 2013 márciusától ő lesz a Kínai -Népköztársaság elnöke. 2010 óta számít az ország kijelölt következő vezetőjének. -QUOTE - - $expected_sentences = [ -'Hszi Csin-ping (kínaiul: 习近平, pinjin, hangsúlyjelekkel: Xí Jìnpíng) (Fuping, Shaanxi tartomány, 1953. június 1.) kínai politikus, 2008. március 15. óta a Kínai Népköztársaság alelnöke, 2012. november 15. óta a KKP KB Politikai Bizottsága Állandó Bizottságának, az ország de facto legfelső hatalmi grémiumának, valamint a KKP Központi Katonai Bizottságának az elnöke.', - 'A várakozások szerint 2013 márciusától ő lesz a Kínai Népköztársaság elnöke.', - '2010 óta számít az ország kijelölt következő vezetőjének.' - ]; - - { - is( - join( '||', @{ $lang->split_text_to_sentences( $test_string ) } ), - join( '||', @{ $expected_sentences } ), - "sentence_split" - ); - } - - # - # Period in the middle of number - # - $test_string = <<'QUOTE'; -A döntőben hibátlan gyakorlatára 16.066-os pontszámot kapott, akárcsak Louis Smith; -a holtversenyt a gyakorlatának magasabb kivitelezési pontszáma döntötte el Berki -javára, aki megnyerte első olimpiai aranyérmét. -QUOTE - - $expected_sentences = [ -'A döntőben hibátlan gyakorlatára 16.066-os pontszámot kapott, akárcsak Louis Smith; a holtversenyt a gyakorlatának magasabb kivitelezési pontszáma döntötte el Berki javára, aki megnyerte első olimpiai aranyérmét.' - ]; - - { - is( - join( '||', @{ $lang->split_text_to_sentences( $test_string ) } ), - join( '||', @{ $expected_sentences } ), - "sentence_split" - ); - } - - # - # Numbers - # - $test_string = <<'QUOTE'; -2002-ben a KSI sportolójaként a junior Európa-bajnokságon lólengésben második, -csapatban 11. volt. A felnőtt mesterfokú magyar bajnokságon megnyerte a lólengést. -A debreceni szerenkénti világbajnokságon kilencedik lett. 2004-ben a vk-sorozatban -Párizsban 13., Cottbusban hatodik volt. A következő évben Rio de Janeiróban -vk-versenyt nyert. A ljubljanai Eb-n csapatban 10., lólengésben bronzérmes lett. -A világkupában Glasgowban ötödik, Gentben negyedik, Stuttgartban harmadik lett. -A birminghami világkupa-döntőn hatodik helyezést ért el. -QUOTE - - $expected_sentences = [ - '2002-ben a KSI sportolójaként a junior Európa-bajnokságon lólengésben második, csapatban 11. volt.', - 'A felnőtt mesterfokú magyar bajnokságon megnyerte a lólengést.', - 'A debreceni szerenkénti világbajnokságon kilencedik lett.', - '2004-ben a vk-sorozatban Párizsban 13., Cottbusban hatodik volt.', - 'A következő évben Rio de Janeiróban vk-versenyt nyert.', - 'A ljubljanai Eb-n csapatban 10., lólengésben bronzérmes lett.', - 'A világkupában Glasgowban ötödik, Gentben negyedik, Stuttgartban harmadik lett.', - 'A birminghami világkupa-döntőn hatodik helyezést ért el.' - ]; - - { - is( - join( '||', @{ $lang->split_text_to_sentences( $test_string ) } ), - join( '||', @{ $expected_sentences } ), - "sentence_split" - ); - } - - # - # Website name - # - $test_string = <<'QUOTE'; -Már előtte a Blikk.hu-n is megnéztem a cikket. Tetszenek a képek, nagyon boldog vagyok. -QUOTE - - $expected_sentences = - [ 'Már előtte a Blikk.hu-n is megnéztem a cikket.', 'Tetszenek a képek, nagyon boldog vagyok.' ]; - - { - is( - join( '||', @{ $lang->split_text_to_sentences( $test_string ) } ), - join( '||', @{ $expected_sentences } ), - "sentence_split" - ); - } - - # - # Name abbreviation - # - $test_string = <<'QUOTE'; -Nagy hatással volt rá W.H. Auden, aki többek közt első operájának, a Paul Bunyannak a szövegkönyvét írta. -QUOTE - - $expected_sentences = - [ -'Nagy hatással volt rá W.H. Auden, aki többek közt első operájának, a Paul Bunyannak a szövegkönyvét írta.' - ]; - - { - is( - join( '||', @{ $lang->split_text_to_sentences( $test_string ) } ), - join( '||', @{ $expected_sentences } ), - "sentence_split" - ); - } - - # - # Roman numeral - # - $test_string = <<'QUOTE'; -1953-ban II. Erzsébet koronázására írta a Gloriana című operáját. -QUOTE - - $expected_sentences = [ '1953-ban II. Erzsébet koronázására írta a Gloriana című operáját.' ]; - - { - is( - join( '||', @{ $lang->split_text_to_sentences( $test_string ) } ), - join( '||', @{ $expected_sentences } ), - "sentence_split" - ); - } -} - -sub main() -{ - # Test::More UTF-8 output - my $builder = Test::More->builder; - binmode $builder->output, ":utf8"; - binmode $builder->failure_output, ":utf8"; - binmode $builder->todo_output, ":utf8"; - - test_split_text_to_sentences(); -} - -main(); diff --git a/apps/common/tests/perl/MediaWords/Languages/it.t b/apps/common/tests/perl/MediaWords/Languages/it.t deleted file mode 100644 index 826ed09e48..0000000000 --- a/apps/common/tests/perl/MediaWords/Languages/it.t +++ /dev/null @@ -1,108 +0,0 @@ -# -# Some test strings copied from Wikipedia (CC-BY-SA, http://creativecommons.org/licenses/by-sa/3.0/). -# - -use strict; -use warnings; - -use Readonly; - -use Test::NoWarnings; -use Test::More tests => 3 + 1; -use utf8; - -use MediaWords::Languages::it; -use Data::Dumper; - -sub test_split_text_to_sentences() -{ - - my $test_string; - my $expected_sentences; - - my $lang = MediaWords::Languages::it->new(); - - # - # Simple paragraph - # - $test_string = <<'QUOTE'; -Charles André Joseph Marie de Gaulle (Lilla, 22 novembre 1890 – Colombey-les-deux-Églises, 9 novembre 1970) è -stato un generale e politico francese. Dopo la sua partenza per Londra nel giugno del 1940, divenne il capo -della Francia libera, che ha combattuto contro il regime di Vichy e contro l'occupazione italiana e tedesca -della Francia durante la seconda guerra mondiale. Presidente del governo provvisorio della Repubblica -francese 1944-1946, ultimo presidente del Consiglio (1958-1959) della Quarta Repubblica, è stato il promotore -della fondazione della Quinta Repubblica, della quale fu primo presidente dal 1959-1969. -QUOTE - - $expected_sentences = [ -'Charles André Joseph Marie de Gaulle (Lilla, 22 novembre 1890 – Colombey-les-deux-Églises, 9 novembre 1970) è stato un generale e politico francese.', -'Dopo la sua partenza per Londra nel giugno del 1940, divenne il capo della Francia libera, che ha combattuto contro il regime di Vichy e contro l\'occupazione italiana e tedesca della Francia durante la seconda guerra mondiale.', -'Presidente del governo provvisorio della Repubblica francese 1944-1946, ultimo presidente del Consiglio (1958-1959) della Quarta Repubblica, è stato il promotore della fondazione della Quinta Repubblica, della quale fu primo presidente dal 1959-1969.' - ]; - - { - is( - join( '||', @{ $lang->split_text_to_sentences( $test_string ) } ), - join( '||', @{ $expected_sentences } ), - "sentence_split" - ); - } - - # - # Period in the middle of the number - # - $test_string = <<'QUOTE'; -Nel 1964, l'azienda di Berlusconi apre un cantiere a Brugherio per edificare una città modello da 4.000 abitanti. -I primi condomini sono pronti già nel 1965, ma non si vendono con facilità. -QUOTE - - $expected_sentences = [ -'Nel 1964, l\'azienda di Berlusconi apre un cantiere a Brugherio per edificare una città modello da 4.000 abitanti.', - 'I primi condomini sono pronti già nel 1965, ma non si vendono con facilità.' - ]; - - { - is( - join( '||', @{ $lang->split_text_to_sentences( $test_string ) } ), - join( '||', @{ $expected_sentences } ), - "sentence_split" - ); - } - - # - # Acronym ("c.a.p.") - # - $test_string = <<'QUOTE'; -La precompressione è una tecnica industriale consistente nel produrre artificialmente una tensione nella -struttura dei materiali da costruzione, e in special modo nel calcestruzzo armato, allo scopo di migliorarne -le caratteristiche di resistenza. Nel calcestruzzo armato precompresso (nel linguaggio comune chiamato -anche cemento armato precompresso, abbreviato con l'acronimo c.a.p.), la precompressione viene -utilizzata per sopperire alla scarsa resistenza a trazione del conglomerato cementizio. -QUOTE - - $expected_sentences = [ -'La precompressione è una tecnica industriale consistente nel produrre artificialmente una tensione nella struttura dei materiali da costruzione, e in special modo nel calcestruzzo armato, allo scopo di migliorarne le caratteristiche di resistenza.', -'Nel calcestruzzo armato precompresso (nel linguaggio comune chiamato anche cemento armato precompresso, abbreviato con l\'acronimo c.a.p.), la precompressione viene utilizzata per sopperire alla scarsa resistenza a trazione del conglomerato cementizio.' - ]; - - { - is( - join( '||', @{ $lang->split_text_to_sentences( $test_string ) } ), - join( '||', @{ $expected_sentences } ), - "sentence_split" - ); - } -} - -sub main() -{ - # Test::More UTF-8 output - my $builder = Test::More->builder; - binmode $builder->output, ":utf8"; - binmode $builder->failure_output, ":utf8"; - binmode $builder->todo_output, ":utf8"; - - test_split_text_to_sentences(); -} - -main(); diff --git a/apps/common/tests/perl/MediaWords/Languages/ja.t b/apps/common/tests/perl/MediaWords/Languages/ja.t deleted file mode 100644 index a682f41e55..0000000000 --- a/apps/common/tests/perl/MediaWords/Languages/ja.t +++ /dev/null @@ -1,65 +0,0 @@ -# -# Basic Japanese tokenizer test -# (more extensive testing is being on the Python side) -# - -use strict; -use warnings; -use utf8; - -use Test::More tests => 3; -use Test::Deep; -use Test::NoWarnings; - -use MediaWords::Languages::ja; - -use Data::Dumper; -use Readonly; - -sub test_split_text_to_sentences($) -{ - my $lang = shift; - - my $input_text = <<'QUOTE'; -ジアゼパムはてんかんや興奮の治療に用いられる。 -This is some English text out of the blue. -また、有痛性筋痙攣(いわゆる“こむらがえり”)などの筋痙攣の治療にはベンゾジアゼピン類の中で最も有用であるとされている。 -This is some more English text. -QUOTE - - my $expected_sentences = [ - 'ジアゼパムはてんかんや興奮の治療に用いられる。', - 'This is some English text out of the blue.', -'また、有痛性筋痙攣(いわゆる“こむらがえり”)などの筋痙攣の治療にはベンゾジアゼピン類の中で最も有用であるとされている。', - 'This is some more English text.', - ]; - my $actual_sentences = $lang->split_text_to_sentences( $input_text ); - - cmp_deeply( $actual_sentences, $expected_sentences ); -} - -sub test_tokenize($) -{ - my $lang = shift; - - my $input_sentence = 'pythonが大好きです'; - my $expected_words = [ 'python', '大好き', ]; - my $actual_words = $lang->split_sentence_to_words( $input_sentence ); - - cmp_deeply( $actual_words, $expected_words, 'tokenize()' ); -} - -sub main() -{ - my $builder = Test::More->builder; - binmode $builder->output, ":utf8"; - binmode $builder->failure_output, ":utf8"; - binmode $builder->todo_output, ":utf8"; - - my $lang = MediaWords::Languages::ja->new(); - - test_split_text_to_sentences( $lang ); - test_tokenize( $lang ); -} - -main(); diff --git a/apps/common/tests/perl/MediaWords/Languages/lt.t b/apps/common/tests/perl/MediaWords/Languages/lt.t deleted file mode 100644 index 881f0de5da..0000000000 --- a/apps/common/tests/perl/MediaWords/Languages/lt.t +++ /dev/null @@ -1,135 +0,0 @@ -# -# Some test strings copied from Wikipedia (CC-BY-SA, http://creativecommons.org/licenses/by-sa/3.0/). -# - -use strict; -use warnings; - -use Readonly; - -use Test::NoWarnings; -use Test::More tests => 4 + 1; -use utf8; - -use MediaWords::Languages::lt; -use Data::Dumper; - -sub test_split_text_to_sentences() -{ - my $test_string; - my $expected_sentences; - - my $lang = MediaWords::Languages::lt->new(); - - # - # Simple paragraph - # - $test_string = <<'QUOTE'; -Kinijos civilizacija yra viena seniausių pasaulyje. Kinijos istorija pasižymi gausa įvairių -rašytinių šaltinių, kurie, kartu su archeologiniais duomenimis, leidžia rekonstruoti -politinį Kinijos gyvenimą ir socialius procesus pradedant gilia senove. Politiškai Kinija -per keletą tūkstantmečių keletą kartų perėjo per besikartojančius politinės vienybės ir -susiskaidymo ciklus. Kinijos teritoriją reguliariai užkariaudavo ateiviai iš išorės, tačiau -daugelis jų anksčiau ar vėliau buvo asimiliuojami į kinų etnosą. -QUOTE - - $expected_sentences = [ - 'Kinijos civilizacija yra viena seniausių pasaulyje.', -'Kinijos istorija pasižymi gausa įvairių rašytinių šaltinių, kurie, kartu su archeologiniais duomenimis, leidžia rekonstruoti politinį Kinijos gyvenimą ir socialius procesus pradedant gilia senove.', -'Politiškai Kinija per keletą tūkstantmečių keletą kartų perėjo per besikartojančius politinės vienybės ir susiskaidymo ciklus.', -'Kinijos teritoriją reguliariai užkariaudavo ateiviai iš išorės, tačiau daugelis jų anksčiau ar vėliau buvo asimiliuojami į kinų etnosą.' - ]; - - { - is( join( '||', @{ $lang->split_text_to_sentences( $test_string ) } ), join( '||', @{ $expected_sentences } ) ); - } - - # - # Abbreviated name ("S. Daukanto") - # - $test_string = <<'QUOTE'; -Lenkimų senosios kapinės, Pušų kapai, Maro kapeliai, Kapeliai (saugotinas kultūros paveldo -objektas) – neveikiančios kapinės vakariniame Skuodo rajono savivaldybės teritorijos -pakraštyje, 1,9 km į rytus nuo Šventosios upės ir Latvijos sienos, Lenkimų miestelio -(Lenkimų seniūnija) pietvakariniame pakraštyje, kelio Skuodas–Kretinga (S. Daukanto -gatvės) dešinėje pusėje. Įrengtos šiaurės – pietų kryptimi pailgoje kalvelėje, apjuostos -statinių tvoros, kurios rytinėje pusėje įrengti varteliai. Kapinių pakraščiuose auga kelios -pušys, o centrinėje dalyje – vietinės reikšmės gamtos paminklu laikoma Kapų pušis. Į pietus -nuo jos stovi monumentalus kryžius ir pora koplytėlių. Pietinėje dalyje išliko pora betoninių -antkapių, ženklinančių buvusius kapus. Priešais kapines pakelėje pastatytas stogastulpio -tipo anotacinis ženklas su įrašu „PUŠŲ KAPAI“. Teritorijos plotas – 0,06 ha. -QUOTE - - $expected_sentences = [ -'Lenkimų senosios kapinės, Pušų kapai, Maro kapeliai, Kapeliai (saugotinas kultūros paveldo objektas) – neveikiančios kapinės vakariniame Skuodo rajono savivaldybės teritorijos pakraštyje, 1,9 km į rytus nuo Šventosios upės ir Latvijos sienos, Lenkimų miestelio (Lenkimų seniūnija) pietvakariniame pakraštyje, kelio Skuodas–Kretinga (S. Daukanto gatvės) dešinėje pusėje.', -'Įrengtos šiaurės – pietų kryptimi pailgoje kalvelėje, apjuostos statinių tvoros, kurios rytinėje pusėje įrengti varteliai.', -'Kapinių pakraščiuose auga kelios pušys, o centrinėje dalyje – vietinės reikšmės gamtos paminklu laikoma Kapų pušis.', - 'Į pietus nuo jos stovi monumentalus kryžius ir pora koplytėlių.', - 'Pietinėje dalyje išliko pora betoninių antkapių, ženklinančių buvusius kapus.', - 'Priešais kapines pakelėje pastatytas stogastulpio tipo anotacinis ženklas su įrašu „PUŠŲ KAPAI“.', - 'Teritorijos plotas – 0,06 ha.' - ]; - - { - is( join( '||', @{ $lang->split_text_to_sentences( $test_string ) } ), join( '||', @{ $expected_sentences } ) ); - } - - # - # Date ("1338 m. rugpjūčio 14 d."), abbreviation ("vok.") - # - $test_string = <<'QUOTE'; -Galialaukių mūšis – 1338 m. rugpjūčio 14 d. netoli Ragainės pilies vykusios kautynės tarp -LDK ir Vokiečių ordino kariuomenių. Ordino maršalo Heinricho Dusmerio vadovaujami -kryžiuočiai Galialaukių vietovėje (vok. Galelouken, Galelauken) pastojo kelią lietuviams, -grįžtantiems į Lietuvą po trijų dienų niokojamo žygio į Prūsiją, surengto greičiausiai -keršijant ordinui už Bajerburgo pilies pastatymą bei Medininkų valsčiaus nuniokojimą. -QUOTE - - $expected_sentences = [ -'Galialaukių mūšis – 1338 m. rugpjūčio 14 d. netoli Ragainės pilies vykusios kautynės tarp LDK ir Vokiečių ordino kariuomenių.', -'Ordino maršalo Heinricho Dusmerio vadovaujami kryžiuočiai Galialaukių vietovėje (vok. Galelouken, Galelauken) pastojo kelią lietuviams, grįžtantiems į Lietuvą po trijų dienų niokojamo žygio į Prūsiją, surengto greičiausiai keršijant ordinui už Bajerburgo pilies pastatymą bei Medininkų valsčiaus nuniokojimą.' - ]; - - { - is( join( '||', @{ $lang->split_text_to_sentences( $test_string ) } ), join( '||', @{ $expected_sentences } ) ); - } - - # - # Dates ("II tūkst. pr. m. e." and others), abbreviation ("kin.") - # - $test_string = <<'QUOTE'; -Daugiausia žinių yra išlikę apie Geltonosios upės vidurupio (taip vadinamos Vidurio lygumos) -arealo raidą, kur jau II tūkst. pr. m. e. viduryje į valdžią atėjo pusiau legendinė Šia -dinastija, kurią pakeitė Šangų dinastija. XI a. pr. m. e. čia įsigalėjo Džou dinastija. -Tuo metu Vidurio lygumos karalystė pradėta vadinti tiesiog "Vidurio karalyste" (kin. -Zhongguo), kas ir davė pavadinimą visai Kinijai. Valdant Džou dinastijai, jos monarchų -simbolinis autoritetas išplito po didžiulę teritoriją. Nors atskiros Kinijos valstybės kovojo -tarpusavyje, kultūriniai mainai intensyvėjo, kas ilgainiui vedė į politinį suvienijimą -III a. pr. m. e. -QUOTE - - $expected_sentences = [ -'Daugiausia žinių yra išlikę apie Geltonosios upės vidurupio (taip vadinamos Vidurio lygumos) arealo raidą, kur jau II tūkst. pr. m. e. viduryje į valdžią atėjo pusiau legendinė Šia dinastija, kurią pakeitė Šangų dinastija.', - 'XI a. pr. m. e. čia įsigalėjo Džou dinastija.', -'Tuo metu Vidurio lygumos karalystė pradėta vadinti tiesiog "Vidurio karalyste" (kin. Zhongguo), kas ir davė pavadinimą visai Kinijai.', - 'Valdant Džou dinastijai, jos monarchų simbolinis autoritetas išplito po didžiulę teritoriją.', -'Nors atskiros Kinijos valstybės kovojo tarpusavyje, kultūriniai mainai intensyvėjo, kas ilgainiui vedė į politinį suvienijimą III a. pr. m. e.' - ]; - - { - is( join( '||', @{ $lang->split_text_to_sentences( $test_string ) } ), join( '||', @{ $expected_sentences } ) ); - } -} - -sub main() -{ - # Test::More UTF-8 output - my $builder = Test::More->builder; - binmode $builder->output, ":utf8"; - binmode $builder->failure_output, ":utf8"; - binmode $builder->todo_output, ":utf8"; - - test_split_text_to_sentences(); -} - -main(); diff --git a/apps/common/tests/perl/MediaWords/Languages/nl.t b/apps/common/tests/perl/MediaWords/Languages/nl.t deleted file mode 100644 index 31076845a4..0000000000 --- a/apps/common/tests/perl/MediaWords/Languages/nl.t +++ /dev/null @@ -1,113 +0,0 @@ -# -# Some test strings copied from Wikipedia (CC-BY-SA, http://creativecommons.org/licenses/by-sa/3.0/). -# - -use strict; -use warnings; - -use Readonly; - -use Test::NoWarnings; -use Test::More tests => 3 + 1; -use utf8; - -use MediaWords::Languages::nl; -use Data::Dumper; - -sub test_split_text_to_sentences() -{ - my $test_string; - my $expected_sentences; - - my $lang = MediaWords::Languages::nl->new(); - - # - # Simple paragraph - # - $test_string = <<'QUOTE'; -Onder neogotiek wordt een 19e-eeuwse stroming in de architectuur verstaan die zich geheel heeft -laten inspireren door de middeleeuwse gotiek. De neogotiek ontstond in Engeland en was een -reactie op de strakke, koele vormen van het classicisme met haar uitgesproken rationele karakter. -De neogotiek vond haar oorsprong in de romantiek met haar belangstelling voor de middeleeuwen. -QUOTE - - $expected_sentences = [ -'Onder neogotiek wordt een 19e-eeuwse stroming in de architectuur verstaan die zich geheel heeft laten inspireren door de middeleeuwse gotiek.', -'De neogotiek ontstond in Engeland en was een reactie op de strakke, koele vormen van het classicisme met haar uitgesproken rationele karakter.', - 'De neogotiek vond haar oorsprong in de romantiek met haar belangstelling voor de middeleeuwen.' - ]; - - { - is( - join( '||', @{ $lang->split_text_to_sentences( $test_string ) } ), - join( '||', @{ $expected_sentences } ), - "sentence_split" - ); - } - - # - # Period in the middle of the number - # - $test_string = <<'QUOTE'; -De vulkaan, meestal gewoon Tongariro genoemd, heeft een hoogte van 1978 meter. Ruim 260.000 -jaar geleden barstte de vulkaan voor het eerst uit. De Tongariro bestaat uit ten minste twaalf -toppen. De Ngarahoe, vaak gezien als een aparte berg, is eigenlijk een bergtop met krater -van de Tongariro. Het is de meest actieve vulkaan in het gebied. Sinds 1839 hebben er meer -dan zeventig uitbarstingen plaatsgevonden. De meest recente uitbarsting was op 21 november -2012 om 13:22 uur, waarbij een aswolk tot 4213 m is gerapporteerd. Dit was slechts 3,5 maand -na de voorlaatste uitbarsting op 6 augustus 2012. -QUOTE - - $expected_sentences = [ - 'De vulkaan, meestal gewoon Tongariro genoemd, heeft een hoogte van 1978 meter.', - 'Ruim 260.000 jaar geleden barstte de vulkaan voor het eerst uit.', - 'De Tongariro bestaat uit ten minste twaalf toppen.', - 'De Ngarahoe, vaak gezien als een aparte berg, is eigenlijk een bergtop met krater van de Tongariro.', - 'Het is de meest actieve vulkaan in het gebied.', - 'Sinds 1839 hebben er meer dan zeventig uitbarstingen plaatsgevonden.', - 'De meest recente uitbarsting was op 21 november 2012 om 13:22 uur, waarbij een aswolk tot 4213 m is gerapporteerd.', - 'Dit was slechts 3,5 maand na de voorlaatste uitbarsting op 6 augustus 2012.' - ]; - - { - is( - join( '||', @{ $lang->split_text_to_sentences( $test_string ) } ), - join( '||', @{ $expected_sentences } ), - "sentence_split" - ); - } - - # - # Abbreviation ("m.a.w") - # - $test_string = <<'QUOTE'; -Aeroob betekent dat een organisme alleen met zuurstof kan gedijen, m.a.w dat het zuurstof -gebruikt. Dit in tegenstelling tot anaerobe organismen, die geen zuurstof nodig hebben. -QUOTE - - $expected_sentences = [ - 'Aeroob betekent dat een organisme alleen met zuurstof kan gedijen, m.a.w dat het zuurstof gebruikt.', - 'Dit in tegenstelling tot anaerobe organismen, die geen zuurstof nodig hebben.' - ]; - - { - is( - join( '||', @{ $lang->split_text_to_sentences( $test_string ) } ), - join( '||', @{ $expected_sentences } ), - "sentence_split" - ); - } -} - -sub main() -{ - # Test::More UTF-8 output - my $builder = Test::More->builder; - binmode $builder->output, ":utf8"; - binmode $builder->failure_output, ":utf8"; - binmode $builder->todo_output, ":utf8"; - - test_split_text_to_sentences(); -} - -main(); diff --git a/apps/common/tests/perl/MediaWords/Languages/no.t b/apps/common/tests/perl/MediaWords/Languages/no.t deleted file mode 100644 index 65068e958e..0000000000 --- a/apps/common/tests/perl/MediaWords/Languages/no.t +++ /dev/null @@ -1,107 +0,0 @@ -# -# Some test strings copied from Wikipedia (CC-BY-SA, http://creativecommons.org/licenses/by-sa/3.0/). -# - -use strict; -use warnings; - -use Readonly; - -use Test::NoWarnings; -use Test::More tests => 3 + 1; -use utf8; - -use MediaWords::Languages::no; -use Data::Dumper; - -sub test_split_text_to_sentences() -{ - my $test_string; - my $expected_sentences; - - my $lang = MediaWords::Languages::no->new(); - - # - # Simple paragraph - # - $test_string = <<'QUOTE'; -Tuvalu er en øynasjon i Polynesia i Stillehavet. Landet har i overkant av 10 000 innbyggere, -og er dermed den selvstendige staten i verden med tredje færrest innbyggere, etter -Vatikanstaten og Nauru. Tuvalu består av ni bebodde atoller spredt over et havområde på -rundt 1,3 millioner km². Med et landareal på bare 26 km² er det verdens fjerde minste -uavhengige stat. De nærmeste øygruppene er Kiribati, Nauru, Samoa og Fiji. -QUOTE - - $expected_sentences = [ - 'Tuvalu er en øynasjon i Polynesia i Stillehavet.', -'Landet har i overkant av 10 000 innbyggere, og er dermed den selvstendige staten i verden med tredje færrest innbyggere, etter Vatikanstaten og Nauru.', - 'Tuvalu består av ni bebodde atoller spredt over et havområde på rundt 1,3 millioner km².', - 'Med et landareal på bare 26 km² er det verdens fjerde minste uavhengige stat.', - 'De nærmeste øygruppene er Kiribati, Nauru, Samoa og Fiji.' - ]; - - { - is( - join( '||', @{ $lang->split_text_to_sentences( $test_string ) } ), - join( '||', @{ $expected_sentences } ), - "sentence_split" - ); - } - - # - # Date ("1. oktober 1978") - # - $test_string = <<'QUOTE'; -De første innbyggerne på Tuvalu var polynesiske folk. Den spanske oppdageren Álvaro de -Mendaña ble i 1568 den første europeeren som fikk øye på landet. I 1819 fikk det navnet -Elliceøyene. Det kom under britisk innflytelse på slutten av 1800-tallet, og fra 1892 -til 1976 utgjorde det en del av det britiske protektoratet og kolonien Gilbert- og -Elliceøyene, sammen med en del av dagens Kiribati. Tuvalu ble selvstendig 1. oktober 1978. -QUOTE - - $expected_sentences = [ - 'De første innbyggerne på Tuvalu var polynesiske folk.', - 'Den spanske oppdageren Álvaro de Mendaña ble i 1568 den første europeeren som fikk øye på landet.', - 'I 1819 fikk det navnet Elliceøyene.', -'Det kom under britisk innflytelse på slutten av 1800-tallet, og fra 1892 til 1976 utgjorde det en del av det britiske protektoratet og kolonien Gilbert- og Elliceøyene, sammen med en del av dagens Kiribati.', - 'Tuvalu ble selvstendig 1. oktober 1978.' - ]; - - { - is( - join( '||', @{ $lang->split_text_to_sentences( $test_string ) } ), - join( '||', @{ $expected_sentences } ), - "sentence_split" - ); - } - - # - # Abbreviation - # - $test_string = <<'QUOTE'; -Tettest er den på hovedatollen Funafuti, med over 1000 innb./km². -QUOTE - - $expected_sentences = [ 'Tettest er den på hovedatollen Funafuti, med over 1000 innb./km².' ]; - - { - is( - join( '||', @{ $lang->split_text_to_sentences( $test_string ) } ), - join( '||', @{ $expected_sentences } ), - "sentence_split" - ); - } -} - -sub main() -{ - # Test::More UTF-8 output - my $builder = Test::More->builder; - binmode $builder->output, ":utf8"; - binmode $builder->failure_output, ":utf8"; - binmode $builder->todo_output, ":utf8"; - - test_split_text_to_sentences(); -} - -main(); diff --git a/apps/common/tests/perl/MediaWords/Languages/pt.t b/apps/common/tests/perl/MediaWords/Languages/pt.t deleted file mode 100644 index cee4fa40da..0000000000 --- a/apps/common/tests/perl/MediaWords/Languages/pt.t +++ /dev/null @@ -1,138 +0,0 @@ -# -# Some test strings copied from Wikipedia (CC-BY-SA, http://creativecommons.org/licenses/by-sa/3.0/). -# - -use strict; -use warnings; - -use Readonly; - -use Test::NoWarnings; -use Test::More tests => 4 + 1; -use utf8; - -use MediaWords::Languages::pt; -use Data::Dumper; - -sub test_split_text_to_sentences() -{ - my $test_string; - my $expected_sentences; - - my $lang = MediaWords::Languages::pt->new(); - - # - # Simple paragraph - # - $test_string = <<'QUOTE'; -França (em francês: France; AFI: [fʁɑ̃s] ouça), oficialmente República Francesa (em francês: -République française; [ʁepyblik fʁɑ̃sɛz]) é um país localizado na Europa Ocidental, com várias -ilhas e territórios ultramarinos noutros continentes. A França Metropolitana se estende do -Mediterrâneo ao Canal da Mancha e Mar do Norte, e do Rio Reno ao Oceano Atlântico. É muitas -vezes referida como L'Hexagone ("O Hexágono") por causa da forma geométrica do seu território. -A nação é o maior país da União Europeia em área e o terceiro maior da Europa, atrás apenas da -Rússia e da Ucrânia (incluindo seus territórios extraeuropeus, como a Guiana Francesa, o país -torna-se maior que a Ucrânia). -QUOTE - - $expected_sentences = [ -'França (em francês: France; AFI: [fʁɑ̃s] ouça), oficialmente República Francesa (em francês: République française; [ʁepyblik fʁɑ̃sɛz]) é um país localizado na Europa Ocidental, com várias ilhas e territórios ultramarinos noutros continentes.', -'A França Metropolitana se estende do Mediterrâneo ao Canal da Mancha e Mar do Norte, e do Rio Reno ao Oceano Atlântico.', - 'É muitas vezes referida como L\'Hexagone ("O Hexágono") por causa da forma geométrica do seu território.', -'A nação é o maior país da União Europeia em área e o terceiro maior da Europa, atrás apenas da Rússia e da Ucrânia (incluindo seus territórios extraeuropeus, como a Guiana Francesa, o país torna-se maior que a Ucrânia).' - ]; - - { - is( - join( '||', @{ $lang->split_text_to_sentences( $test_string ) } ), - join( '||', @{ $expected_sentences } ), - "sentence_split" - ); - } - - # - # Period in the middle of the number ("1:26.250") - # - $test_string = <<'QUOTE'; -O Grande Prêmio da Espanha de 2012 foi a quinta corrida da temporada de 2012 da Fórmula 1. A -prova foi disputada no dia 13 de maio no Circuito da Catalunha, em Barcelona, com treino de -classificação no sábado dia 12 de maio. O primeiro treino livre de sexta-feira teve Fernando -Alonso como líder, já a segunda sessão do mesmo dia foi liderado por Jenson Button. No dia -seguinte, a terceira sessão foi dominada por Sebastian Vettel. O pole position havia sido -Lewis Hamilton, entretanto, o piloto inglês foi punido, sendo excluído do classificatório. -Quem herdou a pole position foi o venezuelano Pastor Maldonado, tornando-se o primeiro -venezuelano na história a conquistar a posição de honra na categoria. Maldonado veio a vencer -a prova no dia seguinte e tornou-se também o primeiro venezuelano na história a vencer uma -corrida de Formula 1. O pódio foi completado por Fernando Alonso, da Ferrari, e Kimi Raikkonen, -da Lotus. A volta mais rápida da corrida foi feita pelo francês Romain Grosjean da Lotus com -o tempo de 1:26.250. -QUOTE - - $expected_sentences = [ - 'O Grande Prêmio da Espanha de 2012 foi a quinta corrida da temporada de 2012 da Fórmula 1.', -'A prova foi disputada no dia 13 de maio no Circuito da Catalunha, em Barcelona, com treino de classificação no sábado dia 12 de maio.', -'O primeiro treino livre de sexta-feira teve Fernando Alonso como líder, já a segunda sessão do mesmo dia foi liderado por Jenson Button.', - 'No dia seguinte, a terceira sessão foi dominada por Sebastian Vettel.', -'O pole position havia sido Lewis Hamilton, entretanto, o piloto inglês foi punido, sendo excluído do classificatório.', -'Quem herdou a pole position foi o venezuelano Pastor Maldonado, tornando-se o primeiro venezuelano na história a conquistar a posição de honra na categoria.', -'Maldonado veio a vencer a prova no dia seguinte e tornou-se também o primeiro venezuelano na história a vencer uma corrida de Formula 1.', - 'O pódio foi completado por Fernando Alonso, da Ferrari, e Kimi Raikkonen, da Lotus.', - 'A volta mais rápida da corrida foi feita pelo francês Romain Grosjean da Lotus com o tempo de 1:26.250.' - ]; - - { - is( - join( '||', @{ $lang->split_text_to_sentences( $test_string ) } ), - join( '||', @{ $expected_sentences } ), - "sentence_split" - ); - } - - # - # Abbreviation ("a.C.") with an end-of-sentence period - # - $test_string = <<'QUOTE'; -Segundo a lenda, Rômulo e Remo fundaram Roma em 753 a.C.. -QUOTE - - $expected_sentences = [ 'Segundo a lenda, Rômulo e Remo fundaram Roma em 753 a.C..' ]; - - { - is( - join( '||', @{ $lang->split_text_to_sentences( $test_string ) } ), - join( '||', @{ $expected_sentences } ), - "sentence_split" - ); - } - - # - # Abbreviation ("a.C.") with an end-of-sentence period, plus another sentence - # - $test_string = <<'QUOTE'; -Segundo a lenda, Rômulo e Remo fundaram Roma em 753 a.C.. This is a test. -QUOTE - - $expected_sentences = [ 'Segundo a lenda, Rômulo e Remo fundaram Roma em 753 a.C..', 'This is a test.' ]; - - { - is( - join( '||', @{ $lang->split_text_to_sentences( $test_string ) } ), - join( '||', @{ $expected_sentences } ), - "sentence_split" - ); - } -} - -sub main() -{ - # Test::More UTF-8 output - my $builder = Test::More->builder; - binmode $builder->output, ":utf8"; - binmode $builder->failure_output, ":utf8"; - binmode $builder->todo_output, ":utf8"; - - test_split_text_to_sentences(); -} - -main(); - diff --git a/apps/common/tests/perl/MediaWords/Languages/ro.t b/apps/common/tests/perl/MediaWords/Languages/ro.t deleted file mode 100644 index 7ff1478569..0000000000 --- a/apps/common/tests/perl/MediaWords/Languages/ro.t +++ /dev/null @@ -1,111 +0,0 @@ -# -# Some test strings copied from Wikipedia (CC-BY-SA, http://creativecommons.org/licenses/by-sa/3.0/). -# - -use strict; -use warnings; - -use Readonly; - -use Test::NoWarnings; -use Test::More tests => 3 + 1; -use utf8; - -use MediaWords::Languages::ro; -use Data::Dumper; - -sub test_split_text_to_sentences() -{ - my $test_string; - my $expected_sentences; - - my $lang = MediaWords::Languages::ro->new(); - - # - # Simple paragraph - # - $test_string = <<'QUOTE'; -În prezent, din întreg ansamblul mănăstirii s-a mai păstrat doar biserica și o clopotniță. -Acestea se află amplasate pe strada Sapienței din sectorul 5 al municipiului București, -în spatele unor blocuri construite în timpul regimului comunist, din apropierea Splaiului -Independenței și a parcului Izvor. În 1813 Mănăstirea Mihai-Vodă „era printre mănăstirile -mari ale țării”. -QUOTE - - $expected_sentences = [ - 'În prezent, din întreg ansamblul mănăstirii s-a mai păstrat doar biserica și o clopotniță.', -'Acestea se află amplasate pe strada Sapienței din sectorul 5 al municipiului București, în spatele unor blocuri construite în timpul regimului comunist, din apropierea Splaiului Independenței și a parcului Izvor.', - 'În 1813 Mănăstirea Mihai-Vodă „era printre mănăstirile mari ale țării”.' - ]; - - { - is( - join( '||', @{ $lang->split_text_to_sentences( $test_string ) } ), - join( '||', @{ $expected_sentences } ), - "sentence_split" - ); - } - - # - # Names ("Sf. Mc. Trifon" and others) - # - $test_string = <<'QUOTE'; -În prezent în interiorul bisericii există o raclă în care sunt păstrate moștele -următorilor Sfinți: Sf. Ioan Iacob Hozevitul, Sf. Xenia Petrovna, Sf. Teofil, Sf. Mc. -Sevastiana, Sf. Mc. Ciprian, Sf. Mc. Iustina, Sf. Mc. Clement, Sf. Mc. Trifon, Cuv. -Auxenție, Sf. Dionisie Zakynthos, Sf. Mc. Anastasie, Sf. Mc. Panaghiotis, Sf. Spiridon, -Sf. Nifon II, Sf. Ignatie Zagorski, Sf. Prooroc Ioan Botezătorul, Cuv. Sava cel Sfințit, -Sf. Mc. Eustatie, Sf. Mc. Theodor Stratilat, Cuv. Paisie, Cuv. Stelian Paflagonul, Sf. -Mc. Mercurie, Sf. Mc. Arhidiacon Ștefan, Sf. Apostol Andrei, Sf. Mc. Dimitrie, Sf. Mc. -Haralambie. -QUOTE - - $expected_sentences = [ -'În prezent în interiorul bisericii există o raclă în care sunt păstrate moștele următorilor Sfinți: Sf. Ioan Iacob Hozevitul, Sf. Xenia Petrovna, Sf. Teofil, Sf. Mc. Sevastiana, Sf. Mc. Ciprian, Sf. Mc. Iustina, Sf. Mc. Clement, Sf. Mc. Trifon, Cuv. Auxenție, Sf. Dionisie Zakynthos, Sf. Mc. Anastasie, Sf. Mc. Panaghiotis, Sf. Spiridon, Sf. Nifon II, Sf. Ignatie Zagorski, Sf. Prooroc Ioan Botezătorul, Cuv. Sava cel Sfințit, Sf. Mc. Eustatie, Sf. Mc. Theodor Stratilat, Cuv. Paisie, Cuv. Stelian Paflagonul, Sf. Mc. Mercurie, Sf. Mc. Arhidiacon Ștefan, Sf. Apostol Andrei, Sf. Mc. Dimitrie, Sf. Mc. Haralambie.' - ]; - - { - is( - join( '||', @{ $lang->split_text_to_sentences( $test_string ) } ), - join( '||', @{ $expected_sentences } ), - "sentence_split" - ); - } - - # - # Abbreviation ("nr.4") - # - $test_string = <<'QUOTE'; -Translatarea în pantă a bisericii, pe o distanță de 289 m și coborâtă pe verticală cu -6,2 m, a avut loc în anul 1985. Operațiune în sine de translatare a edificiului, de -pe Dealul Mihai Vodă, fosta stradă a Arhivelor nr.2 și până în locul în care se află și -astăzi, Strada Sapienței nr.4, în apropierea malului Dâmboviței, a fost considerată la -vremea respectivă o performanță deosebită. -QUOTE - - $expected_sentences = [ -'Translatarea în pantă a bisericii, pe o distanță de 289 m și coborâtă pe verticală cu 6,2 m, a avut loc în anul 1985.', -'Operațiune în sine de translatare a edificiului, de pe Dealul Mihai Vodă, fosta stradă a Arhivelor nr.2 și până în locul în care se află și astăzi, Strada Sapienței nr.4, în apropierea malului Dâmboviței, a fost considerată la vremea respectivă o performanță deosebită.' - ]; - - { - is( - join( '||', @{ $lang->split_text_to_sentences( $test_string ) } ), - join( '||', @{ $expected_sentences } ), - "sentence_split" - ); - } -} - -sub main() -{ - # Test::More UTF-8 output - my $builder = Test::More->builder; - binmode $builder->output, ":utf8"; - binmode $builder->failure_output, ":utf8"; - binmode $builder->todo_output, ":utf8"; - - test_split_text_to_sentences(); -} - -main(); diff --git a/apps/common/tests/perl/MediaWords/Languages/ru.t b/apps/common/tests/perl/MediaWords/Languages/ru.t deleted file mode 100644 index 7f14e8a3b2..0000000000 --- a/apps/common/tests/perl/MediaWords/Languages/ru.t +++ /dev/null @@ -1,247 +0,0 @@ -# -# Some test strings copied from Wikipedia (CC-BY-SA, http://creativecommons.org/licenses/by-sa/3.0/). -# - -use strict; -use warnings; - -use Readonly; - -use Test::NoWarnings; -use Test::More tests => 13; -use utf8; - -use MediaWords::Languages::ru; -use Data::Dumper; - -sub test_stopwords() -{ - my $lang = MediaWords::Languages::ru->new(); - - ok( $lang->stop_words_map() ); - - # Stop words - my $stop_words_ru = $lang->stop_words_map(); - ok( scalar( keys( %{ $stop_words_ru } ) ) >= 140, "stop words (ru) count is correct" ); - - is( $stop_words_ru->{ 'и' }, 1, "Russian test #1" ); - is( $stop_words_ru->{ 'я' }, 1, "Russian test #2" ); -} - -sub test_split_text_to_sentences() -{ - my $test_string; - my $expected_sentences; - - my $lang = MediaWords::Languages::ru->new(); - - # - # Simple paragraph + some non-breakable abbreviations - # - $test_string = <<'QUOTE'; -Новозеландцы пять раз признавались командой года по версии IRB и являются лидером по количеству набранных -очков и единственным коллективом в международном регби, имеющим положительный баланс встреч со всеми своими -соперниками. «Олл Блэкс» удерживали первую строчку в рейтинге сборных Международного совета регби дольше, -чем все остальные команды вместе взятые. За последние сто лет новозеландцы уступали лишь шести национальным -командам (Австралия, Англия, Родезия, Уэльс, Франция и ЮАР). Также в своём активе победу над «чёрными» имеют -сборная Британских островов (англ.)русск. и сборная мира (англ.)русск., которые не являются официальными -членами IRB. Более 75 % матчей сборной с 1903 года завершались победой «Олл Блэкс» — по этому показателю -национальная команда превосходит все остальные. -QUOTE - - $expected_sentences = [ -'Новозеландцы пять раз признавались командой года по версии IRB и являются лидером по количеству набранных очков и единственным коллективом в международном регби, имеющим положительный баланс встреч со всеми своими соперниками.', -'«Олл Блэкс» удерживали первую строчку в рейтинге сборных Международного совета регби дольше, чем все остальные команды вместе взятые.', -'За последние сто лет новозеландцы уступали лишь шести национальным командам (Австралия, Англия, Родезия, Уэльс, Франция и ЮАР).', -'Также в своём активе победу над «чёрными» имеют сборная Британских островов (англ.)русск. и сборная мира (англ.)русск., которые не являются официальными членами IRB.', -'Более 75 % матчей сборной с 1903 года завершались победой «Олл Блэкс» — по этому показателю национальная команда превосходит все остальные.' - ]; - - { - is( - join( '||', @{ $lang->split_text_to_sentences( $test_string ) } ), - join( '||', @{ $expected_sentences } ), - "sentence_split" - ); - } - - # - # Abbreviation ("в т. ч.") - # - $test_string = <<'QUOTE'; -Топоры, в т. ч. транше и шлифованные. Дания. -QUOTE - - $expected_sentences = [ 'Топоры, в т. ч. транше и шлифованные.', 'Дания.' ]; - - { - is( - join( '||', @{ $lang->split_text_to_sentences( $test_string ) } ), - join( '||', @{ $expected_sentences } ), - "sentence_split" - ); - } - - # - # Abbreviation ("род.") - # - $test_string = <<'QUOTE'; -Влади́мир Влади́мирович Пу́тин (род. 7 октября 1952, Ленинград) — российский государственный -и политический деятель; действующий (четвёртый) президент Российской Федерации с 7 мая 2012 -года. Председатель Совета министров Союзного государства (с 2008 года). Второй президент -Российской Федерации с 7 мая 2000 года по 7 мая 2008 года (после отставки президента Бориса -Ельцина исполнял его обязанности с 31 декабря 1999 по 7 мая 2000 года). -QUOTE - - $expected_sentences = [ -'Влади́мир Влади́мирович Пу́тин (род. 7 октября 1952, Ленинград) — российский государственный и политический деятель; действующий (четвёртый) президент Российской Федерации с 7 мая 2012 года.', -'Председатель Совета министров Союзного государства (с 2008 года).', -'Второй президент Российской Федерации с 7 мая 2000 года по 7 мая 2008 года (после отставки президента Бориса Ельцина исполнял его обязанности с 31 декабря 1999 по 7 мая 2000 года).' - ]; - - { - is( - join( '||', @{ $lang->split_text_to_sentences( $test_string ) } ), - join( '||', @{ $expected_sentences } ), - "sentence_split" - ); - } - - # - # Name abbreviations - # - $test_string = <<'QUOTE'; -Впоследствии многие из тех, кто вместе с В. Путиным работал в мэрии Санкт-Петербурга (И. И. -Сечин, Д. А. Медведев, В. А. Зубков, А. Л. Кудрин, А. Б. Миллер, Г. О. Греф, Д. Н. Козак, -В. П. Иванов, С. Е. Нарышкин, В. Л. Мутко и др.) в 2000-е годы заняли ответственные посты -в правительстве России, администрации президента России и руководстве госкомпаний. -QUOTE - - $expected_sentences = [ -'Впоследствии многие из тех, кто вместе с В. Путиным работал в мэрии Санкт-Петербурга (И. И. Сечин, Д. А. Медведев, В. А. Зубков, А. Л. Кудрин, А. Б. Миллер, Г. О. Греф, Д. Н. Козак, В. П. Иванов, С. Е. Нарышкин, В. Л. Мутко и др.) в 2000-е годы заняли ответственные посты в правительстве России, администрации президента России и руководстве госкомпаний.' - ]; - - { - is( - join( '||', @{ $lang->split_text_to_sentences( $test_string ) } ), - join( '||', @{ $expected_sentences } ), - "sentence_split" - ); - } - - # - # Date ("19.04.1953") - # - $test_string = <<'QUOTE'; -Род Моргенстейн (англ. Rod Morgenstein, род. 19.04.1953, Нью-Йорк) — американский барабанщик, -педагог. Он известен по работе с хеви-метал группой конца 80-х Winger и джаз-фьюжн группой -Dixie Dregs. Участвовал в сессионной работе с группами Fiona, Platypus и The Jelly Jam. В -настоящее время он профессор музыкального колледжа Беркли, преподаёт ударные инструменты. -QUOTE - - $expected_sentences = [ -'Род Моргенстейн (англ. Rod Morgenstein, род. 19.04.1953, Нью-Йорк) — американский барабанщик, педагог.', -'Он известен по работе с хеви-метал группой конца 80-х Winger и джаз-фьюжн группой Dixie Dregs.', - 'Участвовал в сессионной работе с группами Fiona, Platypus и The Jelly Jam.', -'В настоящее время он профессор музыкального колледжа Беркли, преподаёт ударные инструменты.' - ]; - - { - is( - join( '||', @{ $lang->split_text_to_sentences( $test_string ) } ), - join( '||', @{ $expected_sentences } ), - "sentence_split" - ); - } -} - -sub test_tokenize() -{ - my $lang = MediaWords::Languages::ru->new(); - - # - # Word tokenizer - # - my $test_string = <<'QUOTE'; -Род Моргенстейн (англ. Rod Morgenstein, род. 19.04.1953, Нью-Йорк) — -американский барабанщик, педагог. Он известен по работе с хеви-метал группой -конца 80-х Winger и джаз-фьюжн группой Dixie Dregs. -QUOTE - - my $expected_words = [ - qw/ - род - моргенстейн - англ - rod - morgenstein - род - 19.04 - 1953 - нью-йорк - американский - барабанщик - педагог - он - известен - по - работе - с - хеви-метал - группой - конца - 80 - х - winger - и - джаз-фьюжн - группой - dixie - dregs/ - ]; - - { - is( join( '||', @{ $lang->split_sentence_to_words( $test_string ) } ), - join( '||', @{ $expected_words } ), "tokenize" ); - } -} - -sub test_stem() -{ - my $lang = MediaWords::Languages::ru->new(); - - # from http://ru.wikipedia.org/ - my $split_words = [ - 'сте́мминг', 'это', 'процесс', 'нахождения', - 'основы', 'слова', 'для', 'заданного', - 'исходного', 'слова', - ]; - - my $expected_stems = [ - 'сте́мминг', 'эт', 'процесс', 'нахожден', - 'основ', 'слов', 'для', 'зада', - 'исходн', 'слов' - ]; - - my $mw_stem_result = $lang->stem_words( $split_words ); - - is_deeply( ( join "_", @{ $mw_stem_result } ), ( join "_", @{ $expected_stems } ), "Stemmer compare test" ); - - is( $mw_stem_result->[ 0 ], lc $split_words->[ 0 ], "first word" ); -} - -sub main() -{ - # Test::More UTF-8 output - my $builder = Test::More->builder; - binmode $builder->output, ":utf8"; - binmode $builder->failure_output, ":utf8"; - binmode $builder->todo_output, ":utf8"; - - test_stopwords(); - test_split_text_to_sentences(); - test_tokenize(); - test_stem(); -} - -main(); diff --git a/apps/common/tests/perl/MediaWords/Languages/sv.t b/apps/common/tests/perl/MediaWords/Languages/sv.t deleted file mode 100644 index 08b70883df..0000000000 --- a/apps/common/tests/perl/MediaWords/Languages/sv.t +++ /dev/null @@ -1,96 +0,0 @@ -# -# Some test strings copied from Wikipedia (CC-BY-SA, http://creativecommons.org/licenses/by-sa/3.0/). -# - -use strict; -use warnings; - -use Readonly; - -use Test::NoWarnings; -use Test::More tests => 2 + 1; -use utf8; - -use MediaWords::Languages::sv; -use Data::Dumper; - -sub test_split_text_to_sentences() -{ - my $test_string; - my $expected_sentences; - - my $lang = MediaWords::Languages::sv->new(); - - # - # Simple paragraph - # - $test_string = <<'QUOTE'; -I sin ungdom studerade Lutosławski piano och komposition i Warszawa. Hans tidiga verk var påverkade av -polsk folkmusik. Han började utveckla sin karaktäristiska kompositionsteknik i slutet av 1950-talet. -Musiken från den här perioden och framåt inbegriper en egen metod att bygga harmonier av mindre grupper -av intervall. Den använder också slumpmässiga processer i vilka stämmornas rytmiska samordning inbegriper -ett moment av slumpmässighet. Hans kompositioner omfattar fyra symfonier, en konsert för orkester, flera -konserter för solo och orkester och orkestrala sångcykler. Efter andra världskriget bannlyste de -stalinistiska makthavarna hans kompositioner då de uppfattades som formalistiska och därmed tillgängliga -bara för en insatt elit, medan Lutosławski själv alltid motsatte sig den socialistiska realismen. Under -1980-talet utnyttjade Lutosławski sin internationella ryktbarhet för att stödja Solidaritet. -QUOTE - - $expected_sentences = [ - 'I sin ungdom studerade Lutosławski piano och komposition i Warszawa.', - 'Hans tidiga verk var påverkade av polsk folkmusik.', - 'Han började utveckla sin karaktäristiska kompositionsteknik i slutet av 1950-talet.', -'Musiken från den här perioden och framåt inbegriper en egen metod att bygga harmonier av mindre grupper av intervall.', -'Den använder också slumpmässiga processer i vilka stämmornas rytmiska samordning inbegriper ett moment av slumpmässighet.', -'Hans kompositioner omfattar fyra symfonier, en konsert för orkester, flera konserter för solo och orkester och orkestrala sångcykler.', -'Efter andra världskriget bannlyste de stalinistiska makthavarna hans kompositioner då de uppfattades som formalistiska och därmed tillgängliga bara för en insatt elit, medan Lutosławski själv alltid motsatte sig den socialistiska realismen.', - 'Under 1980-talet utnyttjade Lutosławski sin internationella ryktbarhet för att stödja Solidaritet.' - ]; - - { - is( - join( '||', @{ $lang->split_text_to_sentences( $test_string ) } ), - join( '||', @{ $expected_sentences } ), - "sentence_split" - ); - } - - # - # Abbreviations ("f. Kr.", "a.C.n.", "A. D.") - # - $test_string = <<'QUOTE'; -Efter Kristus (förkortat e. Kr.) är den i modern svenska vanligtvis använda benämningen på Anno Domini -(latin för Herrens år), utförligare Anno Domini Nostri Iesu Christi (i vår Herres Jesu Kristi år), -oftast förkortat A. D. eller AD, vilket har varit den dominerande tideräkningsnumreringen av årtal i -modern tid i Europa. Årtalssystemet används fortfarande i hela västvärlden och i vetenskapliga och -kommersiella sammanhang även i resten av världen, när man anser att "efter" behöver förtydligas. Efter -den Gregorianska kalenderns införande har bruket att sätta ut AD vid årtalet stadigt minskat. -QUOTE - - $expected_sentences = [ -'Efter Kristus (förkortat e. Kr.) är den i modern svenska vanligtvis använda benämningen på Anno Domini (latin för Herrens år), utförligare Anno Domini Nostri Iesu Christi (i vår Herres Jesu Kristi år), oftast förkortat A. D. eller AD, vilket har varit den dominerande tideräkningsnumreringen av årtal i modern tid i Europa.', -'Årtalssystemet används fortfarande i hela västvärlden och i vetenskapliga och kommersiella sammanhang även i resten av världen, när man anser att "efter" behöver förtydligas.', - 'Efter den Gregorianska kalenderns införande har bruket att sätta ut AD vid årtalet stadigt minskat.' - ]; - - { - is( - join( '||', @{ $lang->split_text_to_sentences( $test_string ) } ), - join( '||', @{ $expected_sentences } ), - "sentence_split" - ); - } -} - -sub main() -{ - # Test::More UTF-8 output - my $builder = Test::More->builder; - binmode $builder->output, ":utf8"; - binmode $builder->failure_output, ":utf8"; - binmode $builder->todo_output, ":utf8"; - - test_split_text_to_sentences(); -} - -main(); diff --git a/apps/common/tests/perl/MediaWords/Languages/tr.t b/apps/common/tests/perl/MediaWords/Languages/tr.t deleted file mode 100644 index 5418b4dc8c..0000000000 --- a/apps/common/tests/perl/MediaWords/Languages/tr.t +++ /dev/null @@ -1,121 +0,0 @@ -# -# Some test strings copied from Wikipedia (CC-BY-SA, http://creativecommons.org/licenses/by-sa/3.0/). -# - -use strict; -use warnings; - -use Readonly; - -use Test::NoWarnings; -use Test::More tests => 3 + 1; -use utf8; - -use MediaWords::Languages::tr; -use Data::Dumper; - -sub test_split_text_to_sentences() -{ - my $test_string; - my $expected_sentences; - - my $lang = MediaWords::Languages::tr->new(); - - # - # Simple paragraph - # - $test_string = <<'QUOTE'; -Google, (NASDAQ: GOOG), internet araması, çevrimiçi bilgi dağıtımı, reklam teknolojileri -ve arama motorları için yatırımlar yapan çok uluslu Amerikan anonim şirketidir. İnternet -tabanlı hizmet ve ürünler geliştirir, ek olarak bunlara ev sahipliği yapar. Kârının büyük -kısmını AdWords programı aracılığıyla reklamlardan elde etmektedir. Şirket, Larry Page ve -Sergey Brin tarafından, Stanford Üniversitesi'nde doktora öğrencisi oldukları sırada -kurulmuştur. İkili, sık sık "Google Guys" olarak anılmaktadır. Google, ilk olarak, 4 -Eylül 1998 tarihinde özel bir şirket olarak kuruldu ve 19 Ağustos 2004 tarihinde halka -arz edildi. Halka arzın gerçekleştiği dönemde, Larry Page, Sergey Brin ve Eric Schmidt, -takip eden yirmi yıl boyunca, yani 2024 yılına kadar Google'da birlikte çalışmak üzere -anlaştılar. Kuruluşundan bu yana misyonu "dünyadaki bilgiyi organize etmek ve bunu -evrensel olarak erişilebilir ve kullanılabilir hale getirmek"tir. Gayri resmi sloganı -ise, Google mühendisi Amit Patel tarafından bulunan ve Paul Buchheit tarafından -desteklenen "Don't be evil"dir. -QUOTE - - $expected_sentences = [ -'Google, (NASDAQ: GOOG), internet araması, çevrimiçi bilgi dağıtımı, reklam teknolojileri ve arama motorları için yatırımlar yapan çok uluslu Amerikan anonim şirketidir.', - 'İnternet tabanlı hizmet ve ürünler geliştirir, ek olarak bunlara ev sahipliği yapar.', - 'Kârının büyük kısmını AdWords programı aracılığıyla reklamlardan elde etmektedir.', -'Şirket, Larry Page ve Sergey Brin tarafından, Stanford Üniversitesi\'nde doktora öğrencisi oldukları sırada kurulmuştur.', - 'İkili, sık sık "Google Guys" olarak anılmaktadır.', -'Google, ilk olarak, 4 Eylül 1998 tarihinde özel bir şirket olarak kuruldu ve 19 Ağustos 2004 tarihinde halka arz edildi.', -'Halka arzın gerçekleştiği dönemde, Larry Page, Sergey Brin ve Eric Schmidt, takip eden yirmi yıl boyunca, yani 2024 yılına kadar Google\'da birlikte çalışmak üzere anlaştılar.', -'Kuruluşundan bu yana misyonu "dünyadaki bilgiyi organize etmek ve bunu evrensel olarak erişilebilir ve kullanılabilir hale getirmek"tir.', -'Gayri resmi sloganı ise, Google mühendisi Amit Patel tarafından bulunan ve Paul Buchheit tarafından desteklenen "Don\'t be evil"dir.' - ]; - - { - is( - join( '||', @{ $lang->split_text_to_sentences( $test_string ) } ), - join( '||', @{ $expected_sentences } ), - "sentence_split" - ); - } - - # - # URLS ("google.com", "google.co.in", ...), abbreviation ("vb.") - # - $test_string = <<'QUOTE'; -Alexa, internette en çok ziyaret edilen web sitesi olarak ABD odaklı "google.com"'u -listelemektedir, YouTube, Blogger, Orkut gibi Google'a ait diğer siteler ve çok -sayıda uluslararası Google sitesi (google.co.in, google.co.uk vb.) ise en çok -ziyaret edilen siteler arasında ilk yüz içinde yer almaktadır. Ek olarak şirket, -BrandZ marka değeri veritabanı listesinde ikinci sırada yer almaktadır. Buna karşın -Google, gizlilik, telif hakkı ve sansür gibi konularda eleştiriler almaktadır. -QUOTE - - $expected_sentences = [ -'Alexa, internette en çok ziyaret edilen web sitesi olarak ABD odaklı "google.com"\'u listelemektedir, YouTube, Blogger, Orkut gibi Google\'a ait diğer siteler ve çok sayıda uluslararası Google sitesi (google.co.in, google.co.uk vb.) ise en çok ziyaret edilen siteler arasında ilk yüz içinde yer almaktadır.', - 'Ek olarak şirket, BrandZ marka değeri veritabanı listesinde ikinci sırada yer almaktadır.', - 'Buna karşın Google, gizlilik, telif hakkı ve sansür gibi konularda eleştiriler almaktadır.' - ]; - - { - is( - join( '||', @{ $lang->split_text_to_sentences( $test_string ) } ), - join( '||', @{ $expected_sentences } ), - "sentence_split" - ); - } - - # - # Period in the middle of the number - # - $test_string = <<'QUOTE'; -Bir yıl önceki rakam olan 931 milyon tekil ziyaretçi sayısındaki yüzde 8.4'lük bir -artışla, 2001 Mayıs ayında; Google'nin tekil ziyaretçi sayısı ilk kez 1 milyarı buldu. -QUOTE - - $expected_sentences = [ -'Bir yıl önceki rakam olan 931 milyon tekil ziyaretçi sayısındaki yüzde 8.4\'lük bir artışla, 2001 Mayıs ayında; Google\'nin tekil ziyaretçi sayısı ilk kez 1 milyarı buldu.' - ]; - - { - is( - join( '||', @{ $lang->split_text_to_sentences( $test_string ) } ), - join( '||', @{ $expected_sentences } ), - "sentence_split" - ); - } -} - -sub main() -{ - # Test::More UTF-8 output - my $builder = Test::More->builder; - binmode $builder->output, ":utf8"; - binmode $builder->failure_output, ":utf8"; - binmode $builder->todo_output, ":utf8"; - - test_split_text_to_sentences(); -} - -main(); diff --git a/apps/common/tests/perl/MediaWords/Languages/zh.t b/apps/common/tests/perl/MediaWords/Languages/zh.t deleted file mode 100644 index d6480f71ad..0000000000 --- a/apps/common/tests/perl/MediaWords/Languages/zh.t +++ /dev/null @@ -1,142 +0,0 @@ -# -# Some tests ported from languagetool ('ChineseSentenceTokenizerTest.java', 'ChineseWordTokenizerTest.java'): -# -# LanguageTool, a natural language style checker -# Copyright (C) 2005 Daniel Naber (http://www.danielnaber.de) -# -# This library is free software; you can redistribute it and/or -# modify it under the terms of the GNU Lesser General Public -# License as published by the Free Software Foundation; either -# version 2.1 of the License, or (at your option) any later version. -# -# This library is distributed in the hope that it will be useful, -# but WITHOUT ANY WARRANTY; without even the implied warranty of -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -# Lesser General Public License for more details. -# -# You should have received a copy of the GNU Lesser General Public -# License along with this library; if not, write to the Free Software -# Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 -# USA -# -# Some test strings copied from Wikipedia (CC-BY-SA, http://creativecommons.org/licenses/by-sa/3.0/). -# - -use strict; -use warnings; - -use Readonly; - -use Test::NoWarnings; -use Test::More tests => 1 + 15 + 15 + 1 + 1; -use utf8; - -use MediaWords::Languages::zh; -use Data::Dumper; - -sub test_split_text_to_sentences() -{ - my $test_string; - my $expected_sentences; - - my $lang = MediaWords::Languages::zh->new(); - - # - # Simple sentence tokenizer test - # - # "Fanny Yi Muli , the illegitimate daughter of the British feminist Mary Wollstonecraft and - # American businessman Gilbert · Yi Muli . Fanny was born shortly Yi Muli took Wollstonecraft - # abandoned in the increasingly chaotic situation of the French Revolution . The frustrated - # love, Wollstonecraft and philosopher Godwin established a close relationship, and, - # ultimately, to marry him. In 1797, Wollstonecraft died of postnatal complications, left - # the three-year-old Fanny and freshmen Mary Worth through Kraft Godwin Godwin tending. Four - # years later, Godwin married a second wife, Fanny sisters do not like the new Mrs. Godwin. - # In 1814, the daughter of the young wife Mary Godwin , Claire Clairmont with runaways, go - # to the European continent and the Romantic poet Percy Bysshe Shelley . Fanny left alone - # to commit suicide in 1816, when he was 22 years old." - $test_string = <<'QUOTE'; -范妮·伊姆利,是英国女权主义者玛丽·沃斯通克拉夫特与美国商人吉尔伯特·伊姆利的私生女。 -在范妮出生不久,伊姆利便将沃斯通克拉夫特抛弃在了法国大革命日趋混乱的局势之中。 -在经历了这次失意的爱情后,沃斯通克拉夫特与哲学家戈德温建立了亲密的关系,并最终与他结婚。 -1797年,沃斯通克拉夫特死于产后并发症,将三岁的范妮与新生的玛丽·沃斯通克拉夫特·戈德温留给了戈德温一人抚育。 -四年后,戈德温与第二任妻子结婚,范妮姐妹俩都不喜欢新的戈德温太太。 -1814年,年少的玛丽与新戈德温太太带来的女儿克莱尔·克莱尔蒙特一同离家出走,并与浪漫主义诗人雪莱前往了欧洲大陆。 -独自留下的范妮于1816年服毒自杀,时年22岁。 -QUOTE - - $expected_sentences = [ -'范妮·伊姆利,是英国女权主义者玛丽·沃斯通克拉夫特与美国商人吉尔伯特·伊姆利的私生女。', -'在范妮出生不久,伊姆利便将沃斯通克拉夫特抛弃在了法国大革命日趋混乱的局势之中。', -'在经历了这次失意的爱情后,沃斯通克拉夫特与哲学家戈德温建立了亲密的关系,并最终与他结婚。', -'1797年,沃斯通克拉夫特死于产后并发症,将三岁的范妮与新生的玛丽·沃斯通克拉夫特·戈德温留给了戈德温一人抚育。', - '四年后,戈德温与第二任妻子结婚,范妮姐妹俩都不喜欢新的戈德温太太。', -'1814年,年少的玛丽与新戈德温太太带来的女儿克莱尔·克莱尔蒙特一同离家出走,并与浪漫主义诗人雪莱前往了欧洲大陆。', - '独自留下的范妮于1816年服毒自杀,时年22岁。' - ]; - - { - is( - join( '||', @{ $lang->split_text_to_sentences( $test_string ) } ), - join( '||', @{ $expected_sentences } ), - "sentence_split" - ); - } - - my $t1 = "他说:"; # "He said:" - my $t2 = "我们是中国人"; # "We are Chinese" - my $t3 = "中国人很好"; # "Chinese people are good" - - # - # Sentence tokenizer: test 1 - # - my @punctuation1 = ( '_', '/', ';', ':', '!', '@', '#', '$', '%', '^', '&', '.', '+', '*', '?' ); - foreach my $i ( @punctuation1 ) - { - my $test_string = $t2 . $i . $t3; - - # Text is a single sentence - is( join( '||', @{ $lang->split_text_to_sentences( $test_string ) } ), $test_string, "sentence_split" ); - } - - # - # Sentence tokenizer: test 2 - # - my @punctuation2 = ( "\x{ff0c}", "\x{ff1a}", "\x{2026}", "\x{ff01}", "\x{ff1f}", "\x{3001}", "\x{ff1b}", "\x{3002}" ); - foreach my $i ( @punctuation1 ) - { - my $test_string = $t2 . $i . $t3; - - # Text is a single sentence - is( join( '||', @{ $lang->split_text_to_sentences( $test_string ) } ), $test_string, "sentence_split" ); - } -} - -sub test_tokenize() -{ - my $lang = MediaWords::Languages::zh->new(); - - # - # Word tokenizer: test 1 - # - my $test_string = '主任强调指出错误的地方。'; - my $expected_words = [ '主任', '强调', '指出', '错误', '的', '地方' ]; - - { - is( join( '||', @{ $lang->split_sentence_to_words( $test_string ) } ), - join( '||', @{ $expected_words } ), "word_split" ); - } -} - -sub main() -{ - # Test::More UTF-8 output - my $builder = Test::More->builder; - binmode $builder->output, ":utf8"; - binmode $builder->failure_output, ":utf8"; - binmode $builder->todo_output, ":utf8"; - - test_split_text_to_sentences(); - test_tokenize(); -} - -main(); diff --git a/apps/common/tests/perl/MediaWords/Solr.t b/apps/common/tests/perl/MediaWords/Solr.t deleted file mode 100644 index 05f8063d4e..0000000000 --- a/apps/common/tests/perl/MediaWords/Solr.t +++ /dev/null @@ -1,261 +0,0 @@ -use strict; -use warnings; - -use MediaWords::CommonLibs; - -use English '-no_match_vars'; - -use Data::Dumper; -use Encode; -use Test::More; -use Test::Deep; - -BEGIN -{ - use_ok( 'MediaWords::Solr' ); -} - -use MediaWords::DB; -use MediaWords::Test::DB::Create; -use MediaWords::Test::Solr; -use MediaWords::Test::Rows; - -# run the given set of params against _gsifsop and verify that the given list of stories_ids (or undef) is returned -sub test_stories_id_query -{ - my ( $params, $expected_stories_ids, $label ) = @_; - - my $got_stories_ids = MediaWords::Solr::_get_stories_ids_from_stories_only_params( $params ); - - if ( $expected_stories_ids ) - { - ok( $got_stories_ids, "$label stories_ids defined" ); - return unless ( $got_stories_ids ); - - is( scalar( @{ $got_stories_ids } ), scalar( @{ $expected_stories_ids } ), "$label expected story count" ); - - my $got_story_lookup = {}; - map { $got_story_lookup->{ $_ } = 1 } @{ $got_stories_ids }; - - map { ok( $got_story_lookup->{ $_ }, "$label: expected stories_id $_" ) } @{ $expected_stories_ids }; - } - else - { - is( $got_stories_ids, undef, "$label: expected undef" ); - } -} - -sub test_solr_stories_only_query() -{ - test_stories_id_query( { q => '' }, undef, 'empty q' ); - test_stories_id_query( { fq => '' }, undef, 'empty fq' ); - test_stories_id_query( { q => '', fq => '' }, undef, 'empty q and fq' ); - test_stories_id_query( { q => '', fq => '' }, undef, 'empty q and fq' ); - - test_stories_id_query( { q => 'stories_id:1' }, [ 1 ], 'simple q match' ); - test_stories_id_query( { q => 'media_id:1' }, undef, 'simple q miss' ); - test_stories_id_query( { q => '*:*', fq => 'stories_id:1' }, [ 1 ], 'simple fq match' ); - test_stories_id_query( { q => '*:*', fq => 'media_id:1' }, undef, 'simple fq miss' ); - - test_stories_id_query( { q => 'media_id:1', fq => 'stories_id:1' }, undef, 'q hit / fq miss' ); - test_stories_id_query( { q => 'stories_id:1', fq => 'media_id:1' }, undef, 'q miss / fq hit' ); - - test_stories_id_query( { q => '*:*', fq => [ 'stories_id:1', 'stories_id:1' ] }, [ 1 ], 'fq list hit' ); - test_stories_id_query( { q => '*:*', fq => [ 'stories_id:1', 'media_id:1' ] }, undef, 'fq list miss' ); - - test_stories_id_query( { q => 'stories_id:1', fq => '' }, [ 1 ], 'q hit / empty fq' ); - test_stories_id_query( { q => 'stories_id:1', fq => [] }, [ 1 ], 'q hit / empty fq list' ); - test_stories_id_query( { q => '*:*', fq => 'stories_id:1' }, [ 1 ], '*:* q / fq hit' ); - test_stories_id_query( { fq => 'stories_id:1' }, undef, 'empty q, fq hit' ); - test_stories_id_query( { q => '*:*' }, undef, '*:* q' ); - - test_stories_id_query( { q => 'stories_id:( 1 2 3 )' }, [ 1, 2, 3 ], 'q list' ); - test_stories_id_query( - { q => 'stories_id:( 1 2 3 )', fq => 'stories_id:( 1 3 4 )' }, - [ 1, 3 ], - 'q list / fq list intersection' - ); - test_stories_id_query( { q => '( stories_id:2 )' }, [ 2 ], 'q parens' ); - test_stories_id_query( { q => '(stories_id:3)' }, [ 3 ], 'q parens no spaces' ); - - test_stories_id_query( { q => 'stories_id:4 and stories_id:4' }, [ 4 ], 'q simple and' ); - test_stories_id_query( { q => 'stories_id:( 1 2 3 ) and stories_id:( 2 3 4 )' }, [ 2, 3 ], 'q and intersection' ); - test_stories_id_query( { q => 'stories_id:( 1 2 3 ) and stories_id:( 4 5 6 )' }, [], 'q and empty intersection' ); - - test_stories_id_query( - { q => 'stories_id:( 1 2 3 4 ) and ( stories_id:( 2 3 4 5 6 ) and stories_id:( 3 4 ) )' }, - [ 3, 4 ], - 'q complex and intersection' - ); - test_stories_id_query( { q => 'stories_id:( 1 2 3 4 ) and ( stories_id:( 2 3 4 5 6 ) and media_id:1 )' }, - undef, 'q complex and intersection miss' ); - test_stories_id_query( { q => 'stories_id:( 1 2 3 4 ) and ( stories_id:( 2 3 4 5 6 ) and stories_id:( 243 ) )' }, - [], 'q complex and intersection empty' ); - test_stories_id_query( - { q => 'stories_id:( 1 2 3 4 ) and stories_id:( 2 3 4 5 6 ) and stories_id:( 3 4 )' }, - [ 3, 4 ], - 'q complex and intersection' - ); - - test_stories_id_query( { q => 'stories_id:1 and ( stories_id:2 and ( stories_id:3 and obama ) )' }, - undef, 'q complex boolean query with buried miss' ); - test_stories_id_query( { q => '( ( stories_id:1 or stories_id:2 ) and stories_id:3 )' }, - undef, 'q complex boolean query with buried or' ); - - test_stories_id_query( { q => 'stories_id:( 1 2 3 4 5 6 )', foo => 'bar' }, undef, 'unrecognized parameters' ); - test_stories_id_query( { q => 'stories_id:( 1 2 3 4 5 6 )', start => '2' }, [ 3, 4, 5, 6 ], 'start parameter' ); - test_stories_id_query( - { q => 'stories_id:( 1 2 3 4 5 6 )', start => '2', rows => 2 }, - [ 3, 4 ], - 'start and rows parameter' - ); - test_stories_id_query( { q => 'stories_id:( 1 2 3 4 5 6 )', rows => 2 }, [ 1, 2 ], 'rows parameter' ); -} - -# tests that require solr to be running -sub run_solr_tests($) -{ - my ( $db ) = @_; - - my $media = MediaWords::Test::Solr::create_indexed_test_story_stack( - $db, - { - medium_1 => { feed_1 => [ map { "story_$_" } ( 1 .. 15 ) ] }, - medium_2 => { feed_2 => [ map { "story_$_" } ( 16 .. 25 ) ] }, - medium_3 => { feed_3 => [ map { "story_$_" } ( 26 .. 50 ) ] }, - } - ); - - my $test_stories = $db->query( "select * from stories order by md5( stories_id::text )" )->hashes; - - { - # basic query - my $story = pop( @{ $test_stories } ); - MediaWords::Test::Solr::test_story_query( $db, '*:*', $story, 'simple story' ); - } - - { - # get_solr_num_found - my ( $expected_num_stories ) = $db->query( "select count(*) from stories" )->flat; - my $got_num_stories = MediaWords::Solr::get_solr_num_found( $db, { q => '*:*' } ); - is( $got_num_stories, $expected_num_stories, 'get_solr_num_found' ); - } - - { - # search_solr_for_processed_stories_ids - my $first_story = $db->query( <hash; -select * from processed_stories order by processed_stories_id asc limit 1 -SQL - - my $got_processed_stories_ids = MediaWords::Solr::search_solr_for_processed_stories_ids( $db, '*:*', undef, 0, 1 ); - is( scalar( @{ $got_processed_stories_ids } ), 1, "search_solr_for_processed_stories_ids count" ); - is( - $got_processed_stories_ids->[ 0 ], - $first_story->{ processed_stories_id }, - "search_solr_for_processed_stories_ids id" - ); - } - - { - # search_solr_for_stories_ids - my $story = pop( @{ $test_stories } ); - my $got_stories_ids = MediaWords::Solr::search_solr_for_stories_ids( $db, { q => "stories_id:$story->{ stories_id }" } ); - is_deeply( $got_stories_ids, [ $story->{ stories_id } ], "search_solr_for_stories_ids" ); - } - - { - eval { MediaWords::Solr::query_solr( $db, { q => "publish_date:[foo TO bar]" } ) }; - ok( $@ =~ /range queries are not allowed/i, "range queries not allowed: '$@'" ); - } -} - -sub test_collections_id_result($$$) -{ - my ( $db, $tags, $label ) = @_; - - my $tags_ids = [ map { $_->{ tags_id } } @{ $tags } ]; - - my ( $q_arg, $q_or_arg ); - if ( scalar( @{ $tags_ids } ) > 1 ) - { - $q_arg = "(" . join( ' ', @{ $tags_ids } ) . ")"; - $q_or_arg = "(" . join( ' or ', @{ $tags_ids } ) . ")"; - } - else - { - $q_arg = $tags_ids->[ 0 ]; - } - - my $expected_media_ids = []; - for my $tag ( @{ $tags } ) - { - for my $medium ( @{ $tag->{ media } } ) - { - push( @{ $expected_media_ids }, $medium->{ media_id } ); - } - } - - my $expected_q = 'media_id:(' . join( ' ', @{ $expected_media_ids } ) . ')'; - - my $got_q = MediaWords::Solr::_insert_collection_media_ids( $db, "tags_id_media:$q_arg" ); - is( $got_q, $expected_q, "$label (tags_id_media)" ); - - $got_q = MediaWords::Solr::_insert_collection_media_ids( $db, "collections_id:$q_arg" ); - is( $got_q, $expected_q, "$label (collections_id)" ); - - if ( $q_or_arg ) - { - $got_q = MediaWords::Solr::_insert_collection_media_ids( $db, "collections_id:$q_or_arg" ); - is( $got_q, $expected_q, "$label (collections_id with ors)" ); - } - -} - -sub test_collections_id_queries($) -{ - my ( $db ) = @_; - - my $num_tags = 10; - my $num_media_per_tag = 10; - - my $tag_set = $db->create( 'tag_sets', { name => 'test' } ); - - my $tags; - - for my $tag_i ( 1 .. $num_tags ) - { - my $tag = $db->create( 'tags', { tag_sets_id => $tag_set->{ tag_sets_id }, tag => "test_$tag_i" } ); - - $tag->{ media } = []; - for my $medium_i ( 1 .. $num_media_per_tag ) - { - my $medium = MediaWords::Test::DB::Create::create_test_medium( $db, "tag $tag_i medium $medium_i" ); - $db->query( <{ tags_id }, $medium->{ media_id } ); -insert into media_tags_map ( tags_id, media_id ) values ( ?, ? ) -SQL - push( @{ $tag->{ media } }, $medium ); - } - - push( @{ $tags }, $tag ); - } - - test_collections_id_result( $db, [ $tags->[ 0 ] ], "single tags_id" ); - test_collections_id_result( $db, $tags, "all tags" ); - test_collections_id_result( $db, [ $tags->[ 0 ], $tags->[ 1 ], $tags->[ 2 ] ], "three tags" ); - -} - -sub main -{ - my $db = MediaWords::DB::connect_to_db(); - - test_solr_stories_only_query(); - - test_collections_id_queries( $db ); - - run_solr_tests( $db ); - - done_testing(); -} - -main(); diff --git a/apps/common/tests/perl/MediaWords/Solr/Query.t b/apps/common/tests/perl/MediaWords/Solr/Query.t deleted file mode 100644 index c56f760bf7..0000000000 --- a/apps/common/tests/perl/MediaWords/Solr/Query.t +++ /dev/null @@ -1,93 +0,0 @@ -use strict; -use warnings; - -use Modern::Perl "2015"; -use MediaWords::CommonLibs; - -use MediaWords::DB; -use MediaWords::Solr::Query; -use MediaWords::Test::DB::Create; - -use Time::Piece; -use Test::More; - -# test that MediaWords::Solr::Query::get_full_solr_query_for_topic() returns the expected query -sub test_get_full_solr_query_for_topic($) -{ - my ( $db ) = @_; - - WARN( "BEGIN test_full_solr_query" ); - - MediaWords::Test::DB::Create::create_test_story_stack_numerated( $db, 10, 2, 2 ); - - # just need some randomly named tags, so copying media names works as well as anything - $db->query( "insert into tag_sets( name ) values ('foo' )" ); - - $db->query( "insert into tags ( tag, tag_sets_id ) select media.name, tag_sets_id from media, tag_sets" ); - - my $topic = MediaWords::Test::DB::Create::create_test_topic( $db, 'full solr query' ); - my $topics_id = $topic->{ topics_id }; - - $db->query( "insert into topics_media_map ( topics_id, media_id ) select ?, media_id from media limit 5", $topics_id ); - $db->query( "insert into topics_media_tags_map ( topics_id, tags_id ) select ?, tags_id from tags limit 5", $topics_id ); - - my $got_full_solr_query = MediaWords::Solr::Query::get_full_solr_query_for_topic( $db, $topic ); - - my @q_matches = - $got_full_solr_query->{ q } =~ /\( (.*) \) and \( media_id:\( ([\d\s]+) \) or tags_id_media:\( ([\d\s]+) \) \)/; - ok( @q_matches, "full solr query: q matches expected pattern: $got_full_solr_query->{ q }" ); - my ( $query, $media_ids_list, $tags_ids_list ) = @q_matches; - - my @fq_matches = $got_full_solr_query->{ fq } =~ - /publish_day\:\[(\d\d\d\d\-\d\d\-\d\d)T00:00:00Z TO (\d\d\d\d\-\d\d\-\d\d)T23:59:59Z\]/; - ok( @fq_matches, "full solr query: fq matches expected pattern: $got_full_solr_query->{ fq }" ); - my ( $start_date, $end_date ) = @fq_matches; - - is( $topic->{ solr_seed_query }, $query, "full solr query: solr_seed_query" ); - - is( $topic->{ start_date }, $start_date, "full solr query: start_date" ); - - my $tp_start = Time::Piece->strptime( $topic->{ start_date }, '%Y-%m-%d' ); - my $expected_end_date = $tp_start->add_months( 1 )->strftime( '%Y-%m-%d' ); - is( $end_date, $expected_end_date, "full solr query: end_date" ); - - my $got_media_ids_list = join( ',', sort( split( ' ', $media_ids_list ) ) ); - my $expected_media_ids = $db->query( "select media_id from topics_media_map where topics_id = ?", $topics_id )->flat; - my $expected_media_ids_list = join( ',', sort( @{ $expected_media_ids } ) ); - is( $got_media_ids_list, $expected_media_ids_list, "full solr query: media ids" ); - - my $got_tags_ids_list = join( ',', sort( split( ' ', $tags_ids_list ) ) ); - my $expected_tags_ids = $db->query( "select tags_id from topics_media_tags_map where topics_id = ?", $topics_id )->flat; - my $expected_tags_ids_list = join( ',', sort( @{ $expected_tags_ids } ) ); - is( $got_tags_ids_list, $expected_tags_ids_list, "full solr query: media ids" ); - - my $offset_full_solr_query = MediaWords::Solr::Query::get_full_solr_query_for_topic( $db, $topic, undef, undef, 1 ); - @fq_matches = $offset_full_solr_query->{ fq } =~ - /publish_day\:\[(\d\d\d\d\-\d\d\-\d\d)T00:00:00Z TO (\d\d\d\d\-\d\d\-\d\d)T23:59:59Z\]/; - - ok( @fq_matches, "offset solr query: matches expected pattern: $got_full_solr_query->{ fq }" ); - - my ( $offset_start_date, $offset_end_date ) = @fq_matches; - - $tp_start = Time::Piece->strptime( $topic->{ start_date }, '%Y-%m-%d' )->add_months( 1 ); - my $expected_start_date = $tp_start->strftime( '%Y-%m-%d' ); - is( $offset_start_date, $expected_start_date, "offset solr query: start_date" ); - - $expected_end_date = $tp_start->add_months( 1 )->strftime( '%Y-%m-%d' ); - is( $offset_end_date, $expected_end_date, "offset solr query: end_date" ); - - my $undef_full_solr_query = MediaWords::Solr::Query::get_full_solr_query_for_topic( $db, $topic, undef, undef, 3 ); - ok( !$undef_full_solr_query, "solr query offset beyond end date is undef" ); -} - -sub main() -{ - my $db = MediaWords::DB::connect_to_db(); - - test_get_full_solr_query_for_topic( $db ); - - done_testing(); -} - -main(); - diff --git a/apps/common/tests/perl/MediaWords/Test/HashServer.t b/apps/common/tests/perl/MediaWords/Test/HashServer.t deleted file mode 100644 index eb08b72510..0000000000 --- a/apps/common/tests/perl/MediaWords/Test/HashServer.t +++ /dev/null @@ -1,102 +0,0 @@ -use strict; -use warnings; -use utf8; - -use Test::More tests => 15; - -use MediaWords::Util::Web; -use MediaWords::Test::URLs; - -use Sys::Hostname; - -BEGIN -{ - use_ok( 'MediaWords::Test::HashServer' ); -} - -my $_port = 8899; - -# verify that a request for the given page on the test server returns the -# given content -sub __test_page -{ - my ( $url, $expected_content ) = @_; - - my $ua = MediaWords::Util::Web::UserAgent->new(); - my $content = $ua->get_string( $url ); - - chomp( $content ); - - is( $content, $expected_content, "test_page: $url" ); -} - -sub main -{ - my $pages = { - '/' => 'home', - '/foo' => 'foo', - '/bar' => '𝒃𝒂𝒓', # UTF-8 - '/foo-bar' => { redirect => '/bar' }, - '/localhost' => { redirect => "http://localhost:$_port/" }, - '/127-foo' => { redirect => "http://127.0.0.1:$_port/foo" }, - '/auth' => { auth => 'foo:bar', content => 'foo bar' }, - '/404' => { content => 'not found', http_status_code => 404 }, - '/callback' => { - callback => sub { - my ( $request ) = @_; - - my $params = $request->query_params(); - my $cookies = $request->cookies(); - - my $response = ''; - $response .= "HTTP/1.0 200 OK\r\n"; - $response .= "Content-Type: text/plain\r\n"; - $response .= "\r\n"; - $response .= "callback"; - return $response; - } - }, - }; - - my $hs = MediaWords::Test::HashServer->new( $_port, $pages ); - - ok( $hs, 'hashserver object returned' ); - - is_urls( $hs->page_url( '/foo' ), "http://" . Sys::Hostname::hostname . ":$_port/foo" ); - - $hs->start(); - - __test_page( "http://localhost:$_port/", 'home' ); - __test_page( "http://localhost:$_port/foo", 'foo' ); - __test_page( "http://localhost:$_port/bar", '𝒃𝒂𝒓' ); - __test_page( "http://localhost:$_port/foo-bar", '𝒃𝒂𝒓' ); - __test_page( "http://127.0.0.1:$_port/localhost", 'home' ); - __test_page( "http://localhost:$_port/127-foo", 'foo' ); - __test_page( "http://localhost:$_port/callback", 'callback' ); - - my $ua = MediaWords::Util::Web::UserAgent->new(); - - my $response_404 = $ua->get( "http://localhost:$_port/404" ); - ok( !$response_404->is_success, "404 response should not succeed" ); - is( $response_404->code, 404, "404 status line" ); - - my $auth_url = "http://localhost:$_port/auth"; - - my $content = $ua->get_string( $auth_url ); - is( $content, undef, 'fail auth / no auth' ); - - my $request = MediaWords::Util::Web::UserAgent::Request->new( 'GET', $auth_url ); - $request->set_authorization_basic( 'foo', 'bar' ); - my $response = $ua->request( $request ); - is( $response->decoded_content, 'foo bar', 'pass auth' ); - - $request = MediaWords::Util::Web::UserAgent::Request->new( 'GET', $auth_url ); - $request->set_authorization_basic( 'foo', 'foo' ); - $response = $ua->request( $request ); - - is( $response->code, 401, 'fail auth / bad password' ); - - $hs->stop(); -} - -main(); diff --git a/apps/common/tests/perl/MediaWords/Util/Compress.t b/apps/common/tests/perl/MediaWords/Util/Compress.t deleted file mode 100644 index e0728fd198..0000000000 --- a/apps/common/tests/perl/MediaWords/Util/Compress.t +++ /dev/null @@ -1,101 +0,0 @@ -use strict; -use warnings; - -use utf8; -use Test::NoWarnings; -use Test::More tests => 106; -use Readonly; - -use_ok( 'MediaWords::Util::Compress' ); - - -sub test_gzip_encode($) -{ - my $test_string = shift; - - my $gzipped_data = MediaWords::Util::Compress::gzip( $test_string ); - ok( length( $gzipped_data ), 'Length of gzipped data is non-zero' ); - isnt( $gzipped_data, $test_string, 'Gzipped data and source string differ' ); - my $gunzipped_data = MediaWords::Util::Compress::gunzip( $gzipped_data ); - is( $gunzipped_data, $test_string, 'Gunzipped data matches source string' ); -} - -sub test_bzip2_encode($) -{ - my $test_string = shift; - - my $bzip2ped_data = MediaWords::Util::Compress::bzip2( $test_string ); - ok( length( $bzip2ped_data ), 'Length of bzip2ped data is non-zero' ); - isnt( $bzip2ped_data, $test_string, 'Bzip2ped data and source string differ' ); - my $bunzip2ped_data = MediaWords::Util::Compress::bunzip2( $bzip2ped_data ); - is( $bunzip2ped_data, $test_string, 'Bunzip2ped data matches source string' ); -} - -sub test_wrong_algorithm($) -{ - my $test_string = shift; - - eval { MediaWords::Util::Compress::bunzip2( MediaWords::Util::Compress::gzip( $test_string ) ) }; - ok( $@, 'String compressed with Gzip, trying to uncompress with Bzip2' ); - eval { MediaWords::Util::Compress::gunzip( MediaWords::Util::Compress::bzip2( $test_string ) ) }; - ok( $@, 'String compressed with Bzip2, trying to uncompress with Gzip' ); -} - -sub test_bad_input() -{ - eval { MediaWords::Util::Compress::gzip( undef ) }; - ok( $@, 'Undefined input for gzip' ); - eval { MediaWords::Util::Compress::gunzip( undef ) }; - ok( $@, 'Undefined input for gunzip' ); - eval { MediaWords::Util::Compress::gunzip( '' ) }; - ok( $@, 'Empty input for gunzip' ); - eval { MediaWords::Util::Compress::gunzip( 'No way this is valid Gzip data' ) }; - ok( $@, 'Invalid input for gunzip' ); - - eval { MediaWords::Util::Compress::bzip2( undef ) }; - ok( $@, 'Undefined input for bzip2' ); - eval { MediaWords::Util::Compress::bunzip2( undef ) }; - ok( $@, 'Undefined input for bunzip2' ); - eval { MediaWords::Util::Compress::bunzip2( '' ) }; - ok( $@, 'Empty input for bunzip2' ); - eval { MediaWords::Util::Compress::bunzip2( 'No way this is valid Bzip2 data' ) }; - ok( $@, 'Invalid input for bunzip2' ); -} - -sub main() -{ - Readonly my @test_strings => ( - - # ASCII - "Media Cloud\r\nMedia Cloud\nMedia Cloud\r\n", - - # UTF-8 - "Media Cloud\r\nąčęėįšųūž\n您好\r\n", - - # Empty string - "", - - # Invalid UTF-8 sequences - "\xc3\x28", - "\xa0\xa1", - "\xe2\x28\xa1", - "\xe2\x82\x28", - "\xf0\x28\x8c\xbc", - "\xf0\x90\x28\xbc", - "\xf0\x28\x8c\x28", - "\xf8\xa1\xa1\xa1\xa1", - "\xfc\xa1\xa1\xa1\xa1\xa1", - - ); - - foreach my $test_string ( @test_strings ) - { - test_gzip_encode( $test_string ); - test_bzip2_encode( $test_string ); - test_wrong_algorithm( $test_string ); - } - - test_bad_input(); -} - -main(); diff --git a/apps/common/tests/perl/MediaWords/Util/DateTime.t b/apps/common/tests/perl/MediaWords/Util/DateTime.t deleted file mode 100644 index 34134418f5..0000000000 --- a/apps/common/tests/perl/MediaWords/Util/DateTime.t +++ /dev/null @@ -1,94 +0,0 @@ -use strict; -use warnings; - -use Modern::Perl "2015"; -use MediaWords::CommonLibs; - -use utf8; -use Test::NoWarnings; -use Test::More tests => 12; - -use MediaWords::Util::DateTime; -use DateTime; - -use_ok( 'MediaWords::Util::DateTime' ); - - -sub test_local_timezone() -{ - my $local_tz = MediaWords::Util::DateTime::local_timezone(); - isa_ok( $local_tz, 'DateTime::TimeZone' ); -} - -sub test_gmt_datetime_from_timestamp() -{ - my $timestamp; - my $datetime; - my $datetime_2; - - # Start of epoch - $timestamp = 0; - $datetime = MediaWords::Util::DateTime::gmt_datetime_from_timestamp( $timestamp ); - isa_ok( $datetime, 'DateTime' ); - is( DateTime->compare( $datetime, DateTime->from_epoch( epoch => 0 ) ), 0 ); - - # Some other date - $datetime_2 = DateTime->new( - year => 1969, - month => 7, - day => 24, - hour => 16, - minute => 50, - second => 35, - time_zone => 'UTC' - ); - $timestamp = $datetime_2->epoch; - $datetime = MediaWords::Util::DateTime::gmt_datetime_from_timestamp( $timestamp ); - isa_ok( $datetime, 'DateTime' ); - is( DateTime->compare( $datetime, $datetime_2 ), 0 ); - - # Some other timezone - $timestamp = DateTime->new( - year => 2017, - month => 3, - day => 3, - hour => 17, - minute => 0, - second => 0, - time_zone => 'America/New_York' - )->epoch; - - $datetime_2 = DateTime->new( # same date, different TZ - year => 2017, - month => 3, - day => 3, - hour => 22, - minute => 0, - second => 0, - time_zone => 'GMT' - ); - $datetime = MediaWords::Util::DateTime::gmt_datetime_from_timestamp( $timestamp ); - isa_ok( $datetime, 'DateTime' ); - is( DateTime->compare( $datetime, $datetime_2 ), 0 ); -} - -sub test_gmt_date_string_from_timestamp() -{ - is( MediaWords::Util::DateTime::gmt_date_string_from_timestamp( 0 ), '1970-01-01T00:00:00' ); - is( MediaWords::Util::DateTime::gmt_date_string_from_timestamp( -13849765 ), '1969-07-24T16:50:35' ); - is( MediaWords::Util::DateTime::gmt_date_string_from_timestamp( 637146000 ), '1990-03-11T09:00:00' ); -} - -sub main() -{ - my $builder = Test::More->builder; - binmode $builder->output, ":utf8"; - binmode $builder->failure_output, ":utf8"; - binmode $builder->todo_output, ":utf8"; - - test_local_timezone(); - test_gmt_datetime_from_timestamp(); - test_gmt_date_string_from_timestamp(); -} - -main(); diff --git a/apps/common/tests/perl/MediaWords/Util/GuessDate.t b/apps/common/tests/perl/MediaWords/Util/GuessDate.t deleted file mode 100644 index f7e86d24f4..0000000000 --- a/apps/common/tests/perl/MediaWords/Util/GuessDate.t +++ /dev/null @@ -1,139 +0,0 @@ -use strict; -use warnings; - -use Test::More tests => 17; -use Test::NoWarnings; -use Test::Deep; - -use utf8; - -use Modern::Perl "2015"; -use MediaWords::CommonLibs; -use Readonly; - -# Integer constants (in case str2time() fails) -Readonly my $TIMESTAMP_12_00_GMT => 1326801600; # Tue, 17 Jan 2012 12:00:00 GMT (UTC); for dates without time / timezone -Readonly my $TIMESTAMP_12_00_EST => 1326819600; # Tue, 17 Jan 2012 12:00:00 EST (-05:00) - -BEGIN { use_ok 'MediaWords::Util::GuessDate' } -BEGIN { use_ok 'MediaWords::Util::Web' } -BEGIN { use_ok 'MediaWords::Util::GuessDate::Result' } - -# Returns URL dating result -sub _gr($;$) -{ - my ( $html, $story_url ) = @_; - $story_url ||= 'http://www.example.com/story.html'; - - return MediaWords::Util::GuessDate::guess_date( $story_url, $html ); -} - -# Returns timestamp of the page or undef -sub _gt($;$) -{ - my ( $html, $story_url ) = @_; - - my $result = _gr( $html, $story_url ); - if ( $result->{ result } eq $MediaWords::Util::GuessDate::Result::FOUND ) - { - return $result->{ timestamp }; - } - else - { - return undef; - } -} - -# Returns dating result of the page; also fetches the URL -sub _gr_url($) -{ - my ( $story_url ) = @_; - - my $html = ''; - unless ( $story_url =~ /example\.(com|net|org)$/gi ) - { - - # 404 Not Found pages will be empty - my $ua = MediaWords::Util::Web::UserAgent->new(); - $html = $ua->get_string( $story_url ) || ''; - } - - return _gr( $html, $story_url ); -} - -sub test_dates() -{ - is( _gt( '' ), - $TIMESTAMP_12_00_EST, 'guess_by_og_article_published_time' ); - - is( _gt( '' ), $TIMESTAMP_12_00_GMT, 'guess_by_meta_pubdate' ); - - # LiveJournal - is( _gt( '' ), - $TIMESTAMP_12_00_EST, '_guess_by_abbr_published_updated_date' ); - -} - -sub test_not_found() -{ - is( - _gr_url( 'http://www.calchannel.com/proposition-36-three-strikes-law/' )->{ result }, - $MediaWords::Util::GuessDate::Result::NOT_FOUND, - '404 Not Found' - ); - is( _gr_url( 'http://www.15min.lt/////' )->{ result }, $MediaWords::Util::GuessDate::Result::NOT_FOUND, 'no path in URL' ); - is( - _gr_url( 'http://en.wikipedia.org/wiki/1980s_in_fashion' )->{ result }, - $MediaWords::Util::GuessDate::Result::NOT_FOUND, - 'Wikipedia URL' - ); - is( - _gr_url( 'https://twitter.com/ladygaga' )->{ result }, - $MediaWords::Util::GuessDate::Result::NOT_FOUND, - 'Twitter user URL' - ); - is( - _gr_url( -'https://www.facebook.com/notes/facebook-engineering/adding-face-to-every-ip-celebrating-ipv6s-one-year-anniversary/10151492544578920' - )->{ result }, - $MediaWords::Util::GuessDate::Result::NOT_FOUND, - 'Facebook URL' - ); - is( - _gr_url( 'http://vimeo.com/blog/archive/year:2013' )->{ result }, - $MediaWords::Util::GuessDate::Result::NOT_FOUND, - 'looks like URL of archive' - ); - is( - _gr_url( 'http://www.timesunion.com/news/crime/article/3-strikes-law-reformed-fewer-harsh-sentences-4013514.php' ) - ->{ result }, - $MediaWords::Util::GuessDate::Result::NOT_FOUND, - 'timesunion.com HTTP 404 Not Found' - ); - is( - _gr_url( 'http://www.seattlepi.com/news/crime/article/ACLU-challenges-human-trafficking-initiative-4018819.php' ) - ->{ result }, - $MediaWords::Util::GuessDate::Result::NOT_FOUND, - 'seattlepi.com HTTP 404 Not Found' - ); - is( - _gr_url( 'http://www.kgoam810.com/Article.asp?id=2569360&spid=' )->{ result }, - $MediaWords::Util::GuessDate::Result::NOT_FOUND, - 'kgoam810.com HTTP access denied' - ); -} - -sub main -{ - my $builder = Test::More->builder; - binmode $builder->output, ":utf8"; - binmode $builder->failure_output, ":utf8"; - binmode $builder->todo_output, ":utf8"; - - test_dates(); - test_not_found(); - - Test::NoWarnings::had_no_warnings(); -} - -main(); diff --git a/apps/common/tests/perl/MediaWords/Util/IdentifyLanguage.t b/apps/common/tests/perl/MediaWords/Util/IdentifyLanguage.t deleted file mode 100644 index 26602e93c9..0000000000 --- a/apps/common/tests/perl/MediaWords/Util/IdentifyLanguage.t +++ /dev/null @@ -1,101 +0,0 @@ -use strict; -use warnings; - -use utf8; - -use Test::NoWarnings; -use Test::More tests => 30; - -use Readonly; - -use_ok( 'MediaWords::Util::IdentifyLanguage' ); - -Readonly my $english_text => 'The quick brown fox jumps over the lazy dog.'; -Readonly my $russian_text => -'«Олл Блэкс» удерживали первую строчку в рейтинге сборных Международного совета регби дольше, чем все остальные команды вместе взятые.'; - -sub test_language_code_for_text() -{ - is( MediaWords::Util::IdentifyLanguage::language_code_for_text( $english_text ), - 'en', 'English text identified as English' ); - is( MediaWords::Util::IdentifyLanguage::language_code_for_text( $russian_text ), - 'ru', 'Russian text identified as Russian' ); - - is( MediaWords::Util::IdentifyLanguage::language_code_for_text( '' ), '', 'Empty text' ); - is( MediaWords::Util::IdentifyLanguage::language_code_for_text( undef ), '', 'Undefined text' ); - - ok( !MediaWords::Util::IdentifyLanguage::identification_would_be_reliable( '0000000000000000000000' ), 'Digits' ); - ok( !MediaWords::Util::IdentifyLanguage::identification_would_be_reliable( '000000000000000aaaaaaa' ), - 'More digits than letters' ); -} - -sub test_identification_would_be_reliable() -{ - ok( MediaWords::Util::IdentifyLanguage::identification_would_be_reliable( $english_text ), 'English text' ); - ok( MediaWords::Util::IdentifyLanguage::identification_would_be_reliable( $russian_text ), 'Russian text' ); - - ok( !MediaWords::Util::IdentifyLanguage::identification_would_be_reliable( undef ), 'Undef text' ); - ok( !MediaWords::Util::IdentifyLanguage::identification_would_be_reliable( '' ), 'Empty text' ); - ok( !MediaWords::Util::IdentifyLanguage::identification_would_be_reliable( 'abc' ), 'Too short text' ); - ok( !MediaWords::Util::IdentifyLanguage::identification_would_be_reliable( '______________________' ), 'Underscores' ); -} - -sub test_language_is_supported() -{ - ok( MediaWords::Util::IdentifyLanguage::language_is_supported( 'en' ), 'Supported language' ); - ok( !MediaWords::Util::IdentifyLanguage::language_is_supported( 'xx' ), 'Unsupported language' ); - - ok( !MediaWords::Util::IdentifyLanguage::language_is_supported( '' ), 'Empty language' ); - ok( !MediaWords::Util::IdentifyLanguage::language_is_supported( undef ), 'Undef language' ); -} - -sub test_utf8() -{ - Readonly my @test_strings => ( - - # UTF-8 - "Media Cloud\r\nąčęėįšųūž\n您好\r\n", - - # Invalid UTF-8 sequences - "\xc3\x28", - "\xa0\xa1", - "\xe2\x28\xa1", - "\xe2\x82\x28", - "\xf0\x28\x8c\xbc", - "\xf0\x90\x28\xbc", - "\xf0\x28\x8c\x28", - "\xf8\xa1\xa1\xa1\xa1", - "\xfc\xa1\xa1\xa1\xa1\xa1", - - ); - - foreach my $test_string ( @test_strings ) - { - eval { MediaWords::Util::IdentifyLanguage::language_code_for_text( $test_string ); }; - ok( !$@, "UTF-8 string: $test_string" ); - } -} - -sub test_very_long_string() -{ - my $very_long_string = 'a' x ( 1024 * 1024 * 10 ); # 10 MB of 'a' - ok( length( $very_long_string ) > 1024 * 1024 * 9 ); - eval { MediaWords::Util::IdentifyLanguage::language_code_for_text( $very_long_string ); }; - ok( !$@, "Very long string" ); -} - -sub main() -{ - my $builder = Test::More->builder; - binmode $builder->output, ':utf8'; - binmode $builder->failure_output, ':utf8'; - binmode $builder->todo_output, ':utf8'; - - test_language_code_for_text(); - test_identification_would_be_reliable(); - test_language_is_supported(); - test_utf8(); - test_very_long_string(); -} - -main(); diff --git a/apps/common/tests/perl/MediaWords/Util/Mail.t b/apps/common/tests/perl/MediaWords/Util/Mail.t deleted file mode 100644 index 5fc38eec7b..0000000000 --- a/apps/common/tests/perl/MediaWords/Util/Mail.t +++ /dev/null @@ -1,47 +0,0 @@ -use strict; -use warnings; - -use Test::More tests => 2; - -use Modern::Perl "2015"; -use MediaWords::CommonLibs; - -use MediaWords::Util::Mail; - -sub test_send_email() -{ - my $message = MediaWords::Util::Mail::Message->new( - { - to => 'nowhere@mediacloud.org', - cc => 'nowhere+cc@mediacloud.org', - bcc => 'nowhere+bcc@mediacloud.org', - subject => 'Hello!', - text_body => -'Text message 𝖜𝖎𝖙𝖍 𝖘𝖔𝖒𝖊 𝖀𝖓𝖎𝖈𝖔𝖉𝖊 𝖈𝖍𝖆𝖗𝖆𝖈𝖙𝖊𝖗𝖘.', - html_body => -'HTML message 𝖜𝖎𝖙𝖍 𝖘𝖔𝖒𝖊 𝖀𝖓𝖎𝖈𝖔𝖉𝖊 𝖈𝖍𝖆𝖗𝖆𝖈𝖙𝖊𝖗𝖘.', - } - ); - ok( MediaWords::Util::Mail::send_email( $message ) ); - -} - -sub test_send_text_email() -{ - my $to = 'nowhere@mediacloud.org'; - my $subject = 'Hello!'; - my $message = -'This is my message 𝖜𝖎𝖙𝖍 𝖘𝖔𝖒𝖊 𝖀𝖓𝖎𝖈𝖔𝖉𝖊 𝖈𝖍𝖆𝖗𝖆𝖈𝖙𝖊𝖗𝖘.'; - - ok( MediaWords::Util::Mail::send_text_email( $to, $subject, $message ) ); -} - -sub main() -{ - MediaWords::Util::Mail::enable_test_mode(); - - test_send_email(); - test_send_text_email(); -} - -main(); diff --git a/apps/common/tests/perl/MediaWords/Util/ParseHTML.t b/apps/common/tests/perl/MediaWords/Util/ParseHTML.t deleted file mode 100644 index 3033e43cca..0000000000 --- a/apps/common/tests/perl/MediaWords/Util/ParseHTML.t +++ /dev/null @@ -1,96 +0,0 @@ -use strict; -use warnings; -use utf8; - -use Test::More tests => 9; -use Test::NoWarnings; - -use Text::Trim; -use Test::Deep; -use MediaWords::Languages::en; - -use_ok( 'MediaWords::Util::ParseHTML' ); - -sub test_html_strip() -{ - my $input_html = <Hello! -EOF - my $expected_output = 'Hello!'; - my $actual_output = trim( MediaWords::Util::ParseHTML::html_strip( $input_html ) ); - is( $actual_output, $expected_output, 'html_strip()' ); -} - -sub test_html_title() -{ - { - my $input_html = <This is the title -EOF - my $fallback = undef; - my $expected_output = 'This is the title'; - my $actual_output = MediaWords::Util::ParseHTML::html_title( $input_html, $fallback ); - is( $actual_output, $expected_output, 'html_title() - basic test' ) - } - - { - my $input_html = ''; - my $fallback = undef; - my $expected_output = ''; - my $actual_output = MediaWords::Util::ParseHTML::html_title( $input_html, $fallback ); - is( $actual_output, $expected_output, 'html_title() - empty' ) - } - - { - my $input_html = <Title with
HTML tags -EOF - my $fallback = undef; - my $expected_output = 'Title with HTML tags'; - my $actual_output = MediaWords::Util::ParseHTML::html_title( $input_html, $fallback ); - is( $actual_output, $expected_output, 'html_title() - strip HTML' ) - } - - { - my $input_html = <Very very very very very long title -EOF - my $fallback = undef; - my $expected_output = 'Very'; - my $actual_output = MediaWords::Util::ParseHTML::html_title( $input_html, $fallback, 4 ); - is( $actual_output, $expected_output, 'html_title() - trimmed title' ) - } - - { - my $input_html = <...........home Title -EOF - my $fallback = undef; - my $expected_output = 'Title'; - my $actual_output = MediaWords::Util::ParseHTML::html_title( $input_html, $fallback ); - is( $actual_output, $expected_output, 'html_title() - _get_medium_title_from_response() exception' ) - } -} - -sub main() -{ - my $builder = Test::More->builder; - binmode $builder->output, ":utf8"; - binmode $builder->failure_output, ":utf8"; - binmode $builder->todo_output, ":utf8"; - - test_html_strip(); - test_html_title(); -} - -main(); diff --git a/apps/common/tests/perl/MediaWords/Util/ParseJSON.t b/apps/common/tests/perl/MediaWords/Util/ParseJSON.t deleted file mode 100644 index e77f558f9b..0000000000 --- a/apps/common/tests/perl/MediaWords/Util/ParseJSON.t +++ /dev/null @@ -1,54 +0,0 @@ -use strict; -use warnings; -use utf8; - -use Test::NoWarnings; -use Readonly; -use Test::More tests => 8; -use Test::Deep; -use Data::Dumper; - -use_ok( 'MediaWords::Util::ParseJSON' ); - -sub test_encode_decode_json() -{ - my $object = [ - 'foo' => { 'bar' => 'baz', }, - 'xyz' => 'zyx', - 'moo', - 'ąčęėįšųūž', - 42 - ]; - my $expected_json = '["foo",{"bar":"baz"},"xyz","zyx","moo","ąčęėįšųūž",42]'; - - my $pretty = 0; - my ( $encoded_json, $decoded_json ); - - $encoded_json = MediaWords::Util::ParseJSON::encode_json( $object, $pretty ); - is( $encoded_json, $expected_json, 'encode_json()' ); - $decoded_json = MediaWords::Util::ParseJSON::decode_json( $encoded_json ); - cmp_deeply( $decoded_json, $object, 'decode_json()' ); - - # Encoding errors - eval { MediaWords::Util::ParseJSON::encode_json( undef ); }; - ok( $@, 'Trying to encode undefined JSON' ); - eval { MediaWords::Util::ParseJSON::encode_json( "strings can't be encoded" ); }; - ok( $@, 'Trying to encode a string' ); - - eval { MediaWords::Util::ParseJSON::decode_json( undef ); }; - ok( $@, 'Trying to decode undefined JSON' ); - eval { MediaWords::Util::ParseJSON::decode_json( 'not JSON' ); }; - ok( $@, 'Trying to decode invalid JSON' ); -} - -sub main() -{ - my $builder = Test::More->builder; - binmode $builder->output, ":utf8"; - binmode $builder->failure_output, ":utf8"; - binmode $builder->todo_output, ":utf8"; - - test_encode_decode_json(); -} - -main(); diff --git a/apps/common/tests/perl/MediaWords/Util/Python.t b/apps/common/tests/perl/MediaWords/Util/Python.t deleted file mode 100644 index 030b4c505a..0000000000 --- a/apps/common/tests/perl/MediaWords/Util/Python.t +++ /dev/null @@ -1,68 +0,0 @@ -use strict; -use warnings; -use utf8; - -use Test::More tests => 17; -use Test::NoWarnings; -use Test::Deep; - -use Readonly; -use Data::Dumper; -use Inline::Python; - -use MediaWords::Util::Python; - -sub test_python_deep_copy() -{ - my $input = { - 'a' => undef, - 'b' => 1, - 'c' => 'd', - 'e' => [ - 'f' => { - 0 => $Inline::Python::Boolean::true, - 1 => $Inline::Python::Boolean::false, - } - ], - }; - my $expected_output = $input; - my $actual_output = python_deep_copy( $input ); - cmp_deeply( $actual_output, $expected_output ); - isnt( $input, $actual_output, 'References must be different' ); -} - -sub test_normalize_boolean_for_db() -{ - my $allow_null; - $allow_null = 0; - is( normalize_boolean_for_db( undef, $allow_null ), 'f' ); - $allow_null = 1; - is( normalize_boolean_for_db( undef, $allow_null ), undef ); - - is( normalize_boolean_for_db( 1 ), 't' ); - is( normalize_boolean_for_db( '1' ), 't' ); - is( normalize_boolean_for_db( 't' ), 't' ); - is( normalize_boolean_for_db( 'T' ), 't' ); - is( normalize_boolean_for_db( 'TRUE' ), 't' ); - is( normalize_boolean_for_db( $Inline::Python::Boolean::true ), 't' ); - - is( normalize_boolean_for_db( 0 ), 'f' ); - is( normalize_boolean_for_db( '0' ), 'f' ); - is( normalize_boolean_for_db( 'f' ), 'f' ); - is( normalize_boolean_for_db( 'F' ), 'f' ); - is( normalize_boolean_for_db( 'FALSE' ), 'f' ); - is( normalize_boolean_for_db( $Inline::Python::Boolean::false ), 'f' ); -} - -sub main() -{ - my $builder = Test::More->builder; - binmode $builder->output, ":utf8"; - binmode $builder->failure_output, ":utf8"; - binmode $builder->todo_output, ":utf8"; - - test_python_deep_copy(); - test_normalize_boolean_for_db(); -} - -main(); diff --git a/apps/common/tests/perl/MediaWords/Util/Text.t b/apps/common/tests/perl/MediaWords/Util/Text.t deleted file mode 100644 index 611e65b179..0000000000 --- a/apps/common/tests/perl/MediaWords/Util/Text.t +++ /dev/null @@ -1,101 +0,0 @@ -use strict; -use warnings; -use utf8; - -use Test::NoWarnings; -use Readonly; -use Test::More tests => 17; -use Test::Deep; - -use_ok( 'MediaWords::Util::Text' ); - - -sub test_encode_decode_utf8() -{ - Readonly my @test_strings => ( - - # ASCII - "Media Cloud\r\nMedia Cloud\nMedia Cloud\r\n", - - # UTF-8 - "Media Cloud\r\nąčęėįšųūž\n您好\r\n", - - # Empty string - "", - - # Invalid UTF-8 sequences - "\xc3\x28", - "\xa0\xa1", - "\xe2\x28\xa1", - "\xe2\x82\x28", - "\xf0\x28\x8c\xbc", - "\xf0\x90\x28\xbc", - "\xf0\x28\x8c\x28", - "\xf8\xa1\xa1\xa1\xa1", - "\xfc\xa1\xa1\xa1\xa1\xa1", - - ); - - foreach my $test_string ( @test_strings ) - { - my $encoded_string = MediaWords::Util::Text::encode_to_utf8( $test_string ); - my $decoded_string = MediaWords::Util::Text::decode_from_utf8( $encoded_string ); - is( $decoded_string, $test_string, "Encoded+decoded string matches" ); - } -} - -sub test_recursively_encode_to_utf8() -{ - my $ascii_string = 'Vazquez'; - my $not_encoded_string = "V\x{00}\x{e1}zquez"; - my $encoded_string = MediaWords::Util::Text::encode_to_utf8( $not_encoded_string ); - my $not_a_string = 42; - - my $input = [ - { - $ascii_string => $ascii_string, - $not_encoded_string => $not_encoded_string, - $not_a_string => $not_a_string - }, - [ $ascii_string, $not_encoded_string, $not_a_string, ], - $ascii_string, - $not_encoded_string, - $not_a_string, - ]; - - my $expected_output = [ - { - $ascii_string => $ascii_string, - $encoded_string => $encoded_string, - $not_a_string => $not_a_string - }, - [ $ascii_string, $encoded_string, $not_a_string, ], - $ascii_string, - $encoded_string, - $not_a_string, - ]; - - my $actual_output = MediaWords::Util::Text::recursively_encode_to_utf8( $input ); - - cmp_deeply( $actual_output, $expected_output, 'Structure got encoded successfully' ); -} - -sub test_is_valid_utf8() -{ - ok( MediaWords::Util::Text::is_valid_utf8( 'pnoןɔ ɐıpǝɯ' ), 'Valid UTF-8' ); - ok( !MediaWords::Util::Text::is_valid_utf8( "\xc3\x28" ), 'Invalid UTF-8' ); -} - -sub main() -{ - my $builder = Test::More->builder; - binmode $builder->output, ":utf8"; - binmode $builder->failure_output, ":utf8"; - binmode $builder->todo_output, ":utf8"; - - test_encode_decode_utf8(); - test_recursively_encode_to_utf8(); - test_is_valid_utf8(); -} - -main(); diff --git a/apps/common/tests/perl/MediaWords/Util/URL/Variants.t b/apps/common/tests/perl/MediaWords/Util/URL/Variants.t deleted file mode 100644 index c5e2f5cab0..0000000000 --- a/apps/common/tests/perl/MediaWords/Util/URL/Variants.t +++ /dev/null @@ -1,263 +0,0 @@ -use strict; -use warnings; -use utf8; - -use Modern::Perl "2015"; -use MediaWords::CommonLibs; - -use Test::NoWarnings; -use Test::Deep; -use Test::More tests => 15; - -use Readonly; -use HTTP::Status qw(:constants); -use Data::Dumper; - -use MediaWords::DB; -use MediaWords::Test::HashServer; -use MediaWords::Test::URLs; -use MediaWords::Util::URL::Variants; -use MediaWords::Test::DB::Create; - - -Readonly my $TEST_HTTP_SERVER_PORT => 9998; - -sub test_all_url_variants($) -{ - my ( $db ) = @_; - - my @actual_url_variants; - my @expected_url_variants; - - # Undefined URL - eval { MediaWords::Util::URL::Variants::all_url_variants( $db, undef ); }; - ok( $@, 'Undefined URL' ); - - # Non-HTTP(S) URL - Readonly my $gopher_url => 'gopher://gopher.floodgap.com/0/v2/vstat'; - @actual_url_variants = MediaWords::Util::URL::Variants::all_url_variants( $db, $gopher_url ); - @expected_url_variants = ( $gopher_url ); - is_deeply( [ sort @actual_url_variants ], [ sort @expected_url_variants ], 'Non-HTTP(S) URL' ); - - # Basic test - Readonly my $TEST_HTTP_SERVER_URL => 'http://localhost:' . $TEST_HTTP_SERVER_PORT; - Readonly my $starting_url_without_cruft => $TEST_HTTP_SERVER_URL . '/first'; - Readonly my $cruft => '?utm_source=A&utm_medium=B&utm_campaign=C'; - Readonly my $starting_url => $starting_url_without_cruft . $cruft; - - my $pages = { - '/first' => '', - '/second' => '', - '/third' => 'This is where the redirect chain should end.', - }; - - my $hs = MediaWords::Test::HashServer->new( $TEST_HTTP_SERVER_PORT, $pages ); - $hs->start(); - @actual_url_variants = MediaWords::Util::URL::Variants::all_url_variants( $db, $starting_url ); - $hs->stop(); - - @expected_url_variants = ( - $starting_url, $starting_url_without_cruft, - $TEST_HTTP_SERVER_URL . '/third', - $TEST_HTTP_SERVER_URL . '/third' . $cruft - ); - is_deeply( [ sort @actual_url_variants ], [ sort @expected_url_variants ], 'Basic all_url_variants() test' ); - - # - $pages = { - '/first' => '', - '/second' => '', - '/third' => '', - }; - - $hs = MediaWords::Test::HashServer->new( $TEST_HTTP_SERVER_PORT, $pages ); - $hs->start(); - @actual_url_variants = MediaWords::Util::URL::Variants::all_url_variants( $db, $starting_url ); - $hs->stop(); - - @expected_url_variants = ( - $starting_url, $starting_url_without_cruft, - $TEST_HTTP_SERVER_URL . '/third', - $TEST_HTTP_SERVER_URL . '/third' . $cruft, - $TEST_HTTP_SERVER_URL . '/fourth', - ); - is_deeply( - [ sort @actual_url_variants ], - [ sort @expected_url_variants ], - ' all_url_variants() test' - ); - - # Redirect to a homepage - $pages = { - '/first' => '', - '/second' => ' 'https://twitter.com/Todd__Kincannon/status/518499096974614529'; - @actual_url_variants = MediaWords::Util::URL::Variants::all_url_variants( $db, $invalid_url_variant ); - @expected_url_variants = ( $invalid_url_variant ); - is_deeply( - [ sort @actual_url_variants ], - [ sort @expected_url_variants ], - 'Invalid URL variant (suspended Twitter account)' - ); -} - -sub test_get_topic_url_variants -{ - my ( $db ) = @_; - - my $data = { - A => { - B => [ 1, 2, 3 ], - C => [ 4, 5, 6 ] - }, - D => { E => [ 7, 8, 9 ] } - }; - - my $media = MediaWords::Test::DB::Create::create_test_story_stack( $db, $data ); - - my $story_1 = $media->{ A }->{ feeds }->{ B }->{ stories }->{ 1 }; - my $story_2 = $media->{ A }->{ feeds }->{ B }->{ stories }->{ 2 }; - my $story_3 = $media->{ A }->{ feeds }->{ B }->{ stories }->{ 3 }; - my $story_4 = $media->{ A }->{ feeds }->{ C }->{ stories }->{ 4 }; - - $db->query( <{ stories_id }, $story_1->{ stories_id } ); -insert into topic_merged_stories_map ( source_stories_id, target_stories_id ) values( ?, ? ) -END - $db->query( <{ stories_id }, $story_2->{ stories_id } ); -insert into topic_merged_stories_map ( source_stories_id, target_stories_id ) values( ?, ? ) -END - - my $tag_set = $db->create( 'tag_sets', { name => 'foo' } ); - - my $topic = MediaWords::Test::DB::Create::create_test_topic( $db, 'foo' ); - - $db->create( - 'topic_stories', - { - topics_id => $topic->{ topics_id }, - stories_id => $story_4->{ stories_id } - } - ); - - $db->create( - 'topic_stories', - { - topics_id => $topic->{ topics_id }, - stories_id => $story_1->{ stories_id } - } - ); - - $db->create( - 'topic_links', - { - topics_id => $topic->{ topics_id }, - stories_id => $story_4->{ stories_id }, - ref_stories_id => $story_1->{ stories_id }, - url => $story_1->{ url }, - redirect_url => $story_1->{ url } . "/redirect_url" - } - ); - - $db->create( - 'topic_stories', - { - topics_id => $topic->{ topics_id }, - stories_id => $story_2->{ stories_id } - } - ); - - $db->create( - 'topic_links', - { - topics_id => $topic->{ topics_id }, - stories_id => $story_4->{ stories_id }, - ref_stories_id => $story_2->{ stories_id }, - url => $story_2->{ url }, - redirect_url => $story_2->{ url } . "/redirect_url" - } - ); - - $db->create( - 'topic_stories', - { - topics_id => $topic->{ topics_id }, - stories_id => $story_3->{ stories_id } - } - ); - - $db->create( - 'topic_links', - { - topics_id => $topic->{ topics_id }, - stories_id => $story_4->{ stories_id }, - ref_stories_id => $story_3->{ stories_id }, - url => $story_3->{ url } . '/alternate', - } - ); - - my $expected_urls = [ - $story_1->{ url }, - $story_1->{ url } . '?utm_source=A&utm_medium=B&utm_campaign=C', - $story_2->{ url }, - $story_1->{ url } . "/redirect_url", - $story_2->{ url } . "/redirect_url", - $story_3->{ url }, - $story_3->{ url } . "/alternate" - ]; - - my $test_url = $story_1->{ url } . '?utm_source=A&utm_medium=B&utm_campaign=C'; - my $url_variants = MediaWords::Util::URL::Variants::all_url_variants( $db, $test_url ); - - $url_variants = [ sort { $a cmp $b } @{ $url_variants } ]; - $expected_urls = [ sort { $a cmp $b } @{ $expected_urls } ]; - - is( scalar( @{ $url_variants } ), scalar( @{ $expected_urls } ), 'test_get_topic_url_variants: same number variants' ); - - for ( my $i = 0 ; $i < @{ $expected_urls } ; $i++ ) - { - is_urls( $url_variants->[ $i ], $expected_urls->[ $i ], 'test_get_topic_url_variants: url variant match $i' ); - } -} - -sub main() -{ - my $builder = Test::More->builder; - binmode $builder->output, ":utf8"; - binmode $builder->failure_output, ":utf8"; - binmode $builder->todo_output, ":utf8"; - - my $db = MediaWords::DB::connect_to_db(); - - test_all_url_variants( $db ); - test_all_url_variants_invalid_variants( $db ); - test_get_topic_url_variants( $db ); -} - -main(); diff --git a/apps/common/tests/perl/MediaWords/Util/Web/UserAgent.t b/apps/common/tests/perl/MediaWords/Util/Web/UserAgent.t deleted file mode 100644 index 9c39a7c1d4..0000000000 --- a/apps/common/tests/perl/MediaWords/Util/Web/UserAgent.t +++ /dev/null @@ -1,1438 +0,0 @@ -use strict; -use warnings; -use utf8; - -use Modern::Perl "2015"; -use MediaWords::CommonLibs; - -use Test::Deep; -use Test::More tests => 139; - -use Encode; -use File::Temp qw/ tempdir tempfile /; -use File::Slurp; -use HTTP::Status qw(:constants); -use Readonly; -use Data::Dumper; -use URI; -use URI::Escape; -use URI::QueryParam; - -use MediaWords::Util::Config::Common; -use MediaWords::Util::ParseJSON; -use MediaWords::Util::Web; -use MediaWords::Util::Text; -use MediaWords::Test::HashServer; -use MediaWords::Test::URLs; - -my Readonly $TEST_HTTP_SERVER_PORT = 9998; -my Readonly $TEST_HTTP_SERVER_URL = 'http://localhost:' . $TEST_HTTP_SERVER_PORT; - -sub test_get() -{ - eval { - my $ua = MediaWords::Util::Web::UserAgent->new(); - $ua->get( undef ); - }; - ok( $@, 'Undefined URL' ); - - eval { - my $ua = MediaWords::Util::Web::UserAgent->new(); - $ua->get( 'gopher://gopher.floodgap.com/0/v2/vstat' ); - }; - ok( $@, 'Non-HTTP(S) URL' ); - - # Basic GET - my $pages = { '/test' => 'Hello!', }; - - my $hs = MediaWords::Test::HashServer->new( $TEST_HTTP_SERVER_PORT, $pages ); - $hs->start(); - - my $ua = MediaWords::Util::Web::UserAgent->new(); - my $response = $ua->get( "$TEST_HTTP_SERVER_URL/test" ); - - $hs->stop(); - - is_urls( $response->request()->url(), $TEST_HTTP_SERVER_URL . '/test' ); - is( $response->decoded_content(), 'Hello!' ); -} - -sub test_get_user_agent_from_headers() -{ - # User-Agent: and From: headers - my $pages = { - '/user-agent-from-headers' => { - callback => sub { - my ( $request ) = @_; - - my $response = ''; - - $response .= "HTTP/1.0 200 OK\r\n"; - $response .= "Content-Type: application/json; charset=UTF-8\r\n"; - $response .= "\r\n"; - $response .= MediaWords::Util::ParseJSON::encode_json( - { - 'user-agent' => $request->header( 'User-Agent' ), - 'from' => $request->header( 'From' ), - } - ); - - return $response; - } - } - }; - my $hs = MediaWords::Test::HashServer->new( $TEST_HTTP_SERVER_PORT, $pages ); - $hs->start(); - - my $ua = MediaWords::Util::Web::UserAgent->new(); - my $response = $ua->get( "$TEST_HTTP_SERVER_URL/user-agent-from-headers" ); - - $hs->stop(); - - ok( $response->is_success() ); - is_urls( $response->request()->url(), $TEST_HTTP_SERVER_URL . '/user-agent-from-headers' ); - - my $expected_user_agent = 'mediacloud bot for open academic research (http://mediacloud.org)'; - my $expected_from = 'info@mediacloud.org'; - - my $decoded_json = MediaWords::Util::ParseJSON::decode_json( $response->decoded_content() ); - cmp_deeply( - $decoded_json, - { - 'user-agent' => $expected_user_agent, - 'from' => $expected_from, - } - ); -} - -sub test_get_not_found() -{ - # HTTP redirects - my $pages = { - '/does-not-exist' => { - callback => sub { - my ( $request ) = @_; - - my $response = ''; - - $response .= "HTTP/1.0 404 Not Found\r\n"; - $response .= "Content-Type: text/html; charset=UTF-8\r\n"; - $response .= "\r\n"; - $response .= "I do not exist."; - - return $response; - } - } - }; - my $hs = MediaWords::Test::HashServer->new( $TEST_HTTP_SERVER_PORT, $pages ); - $hs->start(); - - my $ua = MediaWords::Util::Web::UserAgent->new(); - my $response = $ua->get( "$TEST_HTTP_SERVER_URL/does-not-exist" ); - - $hs->stop(); - - is_urls( $response->request()->url(), $TEST_HTTP_SERVER_URL . '/does-not-exist' ); - ok( !$response->is_success() ); - is( $response->decoded_content(), 'I do not exist.' ); -} - -sub test_get_valid_utf8_content() -{ - # Valid UTF-8 content - my $pages = { - '/valid-utf-8' => { - header => 'Content-Type: text/plain; charset=UTF-8', - content => '¡ollǝɥ', - }, - }; - - my $hs = MediaWords::Test::HashServer->new( $TEST_HTTP_SERVER_PORT, $pages ); - $hs->start(); - - my $ua = MediaWords::Util::Web::UserAgent->new(); - my $response = $ua->get( "$TEST_HTTP_SERVER_URL/valid-utf-8" ); - - $hs->stop(); - - is_urls( $response->request()->url(), $TEST_HTTP_SERVER_URL . '/valid-utf-8' ); - is( $response->decoded_content(), '¡ollǝɥ' ); -} - -sub test_get_invalid_utf8_content() -{ - # Invalid UTF-8 content - my $pages = { - '/invalid-utf-8' => { - header => 'Content-Type: text/plain; charset=UTF-8', - content => "\xf0\x90\x28\xbc", - }, - }; - - my $hs = MediaWords::Test::HashServer->new( $TEST_HTTP_SERVER_PORT, $pages ); - $hs->start(); - - my $ua = MediaWords::Util::Web::UserAgent->new(); - my $response = $ua->get( "$TEST_HTTP_SERVER_URL/invalid-utf-8" ); - - $hs->stop(); - - is_urls( $response->request()->url(), $TEST_HTTP_SERVER_URL . '/invalid-utf-8' ); - - # https://en.wikipedia.org/wiki/Specials_(Unicode_block)#Replacement_character - my $replacement_character = "\x{FFFD}"; - ok( - # OS X: - $response->decoded_content() eq "$replacement_character\x28$replacement_character" or - - # Ubuntu: - $response->decoded_content() eq "$replacement_character$replacement_character\x28$replacement_character" - ); -} - -sub test_get_non_utf8_content() -{ - # Non-UTF-8 content - use bytes; - - my $pages = { - '/non-utf-8' => { - header => 'Content-Type: text/plain; charset=iso-8859-13', - content => "\xd0auk\xf0tai po piet\xf8.", - }, - }; - - my $hs = MediaWords::Test::HashServer->new( $TEST_HTTP_SERVER_PORT, $pages ); - $hs->start(); - - my $ua = MediaWords::Util::Web::UserAgent->new(); - my $response = $ua->get( "$TEST_HTTP_SERVER_URL/non-utf-8" ); - - $hs->stop(); - - no bytes; - - is_urls( $response->request()->url(), $TEST_HTTP_SERVER_URL . '/non-utf-8' ); - is( $response->decoded_content(), 'Šaukštai po pietų.' ); -} - -sub test_get_max_size() -{ - my $test_content = MediaWords::Util::Text::random_string( 1024 * 10 ); - my $max_size = length( $test_content ) / 10; - my $pages = { '/max-download-side' => $test_content, }; - - my $hs = MediaWords::Test::HashServer->new( $TEST_HTTP_SERVER_PORT, $pages ); - $hs->start(); - - my $ua = MediaWords::Util::Web::UserAgent->new(); - $ua->set_max_size( $max_size ); - is( $ua->max_size(), $max_size ); - - my $response = $ua->get( "$TEST_HTTP_SERVER_URL/max-download-side" ); - - $hs->stop(); - - is_urls( $response->request()->url(), $TEST_HTTP_SERVER_URL . '/max-download-side' ); - - # LWP::UserAgent truncates the response but still reports it as successful - ok( $response->is_success() ); - ok( length( $response->decoded_content() ) >= $max_size ); - ok( length( $response->decoded_content() ) <= length( $test_content ) ); -} - -sub test_get_max_redirect() -{ - my $max_redirect = 3; - my $pages = { - '/1' => { redirect => '/2' }, - '/2' => { redirect => '/3' }, - '/3' => { redirect => '/4' }, - '/4' => { redirect => '/5' }, - '/5' => { redirect => '/6' }, - '/6' => { redirect => '/7' }, - '/7' => { redirect => '/8' }, - '/8' => "Shouldn't be able to get to this one.", - }; - - my $hs = MediaWords::Test::HashServer->new( $TEST_HTTP_SERVER_PORT, $pages ); - $hs->start(); - - my $ua = MediaWords::Util::Web::UserAgent->new(); - $ua->set_max_redirect( $max_redirect ); - is( $ua->max_redirect(), $max_redirect ); - - my $response = $ua->get( "$TEST_HTTP_SERVER_URL/1" ); - - $hs->stop(); - - ok( !$response->is_success() ); -} - -sub test_get_request_headers() -{ - my $pages = { - '/test-custom-header' => { - callback => sub { - my ( $request ) = @_; - - my $response = ''; - - $response .= "HTTP/1.0 200 OK\r\n"; - $response .= "Content-Type: application/json; charset=UTF-8\r\n"; - $response .= "\r\n"; - $response .= - MediaWords::Util::ParseJSON::encode_json( { 'custom-header' => $request->header( 'X-Custom-Header' ), } ); - - return $response; - } - } - }; - my $hs = MediaWords::Test::HashServer->new( $TEST_HTTP_SERVER_PORT, $pages ); - $hs->start(); - - my $ua = MediaWords::Util::Web::UserAgent->new(); - my $url = "$TEST_HTTP_SERVER_URL/test-custom-header"; - - my $request = MediaWords::Util::Web::UserAgent::Request->new( 'GET', $url ); - $request->set_header( 'X-Custom-Header', 'foo' ); - - my $response = $ua->request( $request ); - - ok( $response->is_success() ); - is_urls( $response->request()->url(), $TEST_HTTP_SERVER_URL . '/test-custom-header' ); - - my $decoded_json = MediaWords::Util::ParseJSON::decode_json( $response->decoded_content() ); - cmp_deeply( $decoded_json, { 'custom-header' => 'foo' } ); - - $hs->stop(); -} - -sub test_get_response_status() -{ - my $pages = { - '/test' => { - callback => sub { - my ( $request ) = @_; - - my $response = ''; - - $response .= "HTTP/1.0 418 Jestem czajniczek\r\n"; - $response .= "Content-Type: text/html; charset=UTF-8\r\n"; - $response .= "\r\n"; - $response .= "☕"; - - return $response; - } - } - }; - - my $hs = MediaWords::Test::HashServer->new( $TEST_HTTP_SERVER_PORT, $pages ); - $hs->start(); - - my $ua = MediaWords::Util::Web::UserAgent->new(); - my $response = $ua->get( "$TEST_HTTP_SERVER_URL/test" ); - - $hs->stop(); - - is_urls( $response->request()->url(), $TEST_HTTP_SERVER_URL . '/test' ); - is( $response->decoded_content(), '☕' ); - - # HTTP status cod and message - is( $response->code(), 418 ); - is( $response->message(), 'Jestem czajniczek' ); - is( $response->status_line(), '418 Jestem czajniczek' ); -} - -sub test_get_response_headers() -{ - my $pages = { - '/test' => { - header => "Content-Type: text/plain; charset=UTF-8\r\nX-Media-Cloud: mediacloud", - content => "pnolɔ ɐıpǝɯ", - } - }; - - my $hs = MediaWords::Test::HashServer->new( $TEST_HTTP_SERVER_PORT, $pages ); - $hs->start(); - - my $ua = MediaWords::Util::Web::UserAgent->new(); - my $response = $ua->get( "$TEST_HTTP_SERVER_URL/test" ); - - $hs->stop(); - - is_urls( $response->request()->url(), $TEST_HTTP_SERVER_URL . '/test' ); - is( $response->decoded_content(), 'pnolɔ ɐıpǝɯ' ); - - # Uppercase / lowercase headers - is( $response->header( 'X-Media-Cloud' ), 'mediacloud' ); - is( $response->header( 'x-media-cloud' ), 'mediacloud' ); -} - -sub test_get_response_content_type() -{ - my $pages = { - '/test' => { - header => "Content-Type: application/xhtml+xml; charset=UTF-8", - content => "pnolɔ ɐıpǝɯ", - } - }; - - my $hs = MediaWords::Test::HashServer->new( $TEST_HTTP_SERVER_PORT, $pages ); - $hs->start(); - - my $ua = MediaWords::Util::Web::UserAgent->new(); - my $response = $ua->get( "$TEST_HTTP_SERVER_URL/test" ); - - $hs->stop(); - - is_urls( $response->request()->url(), $TEST_HTTP_SERVER_URL . '/test' ); - is( $response->decoded_content(), 'pnolɔ ɐıpǝɯ' ); - - is( $response->content_type(), 'application/xhtml+xml' ); -} - -sub test_get_blacklisted_url() -{ - my $tempdir = tempdir( CLEANUP => 1 ); - ok( -e $tempdir ); - - my $whitelist_temp_file = $tempdir . '/whitelisted_url_opened.txt'; - my $blacklist_temp_file = $tempdir . '/blacklisted_url_opened.txt'; - ok( !-e $whitelist_temp_file ); - ok( !-e $blacklist_temp_file ); - - my $pages = { - '/whitelisted' => { - callback => sub { - my ( $request ) = @_; - - open( my $fh, '>', $whitelist_temp_file ); - print $fh "Whitelisted URL has been fetched."; - close $fh; - - my $response = ''; - - $response .= "HTTP/1.0 200 OK\r\n"; - $response .= "Content-Type: text/plain\r\n"; - $response .= "\r\n"; - $response .= "Whitelisted page (should be fetched)."; - - return $response; - } - }, - '/blacklisted' => { - callback => sub { - my ( $request ) = @_; - - open( my $fh, '>', $blacklist_temp_file ); - print $fh "Blacklisted URL has been fetched."; - close $fh; - - my $response = ''; - - $response .= "HTTP/1.0 200 OK\r\n"; - $response .= "Content-Type: text/plain\r\n"; - $response .= "\r\n"; - $response .= "Blacklisted page (should not be fetched)."; - - return $response; - } - }, - }; - - my $whitelisted_url = $TEST_HTTP_SERVER_URL . "/whitelisted"; - my $blacklisted_url = $TEST_HTTP_SERVER_URL . "/blacklisted"; - - { - package BlacklistedURLUserAgentConfig; - - use strict; - use warnings; - - use base 'MediaWords::Util::Config::Common::UserAgent'; - - sub blacklist_url_pattern() - { - return "$blacklisted_url"; - } - - 1; - } - - my $default_ua_config = MediaWords::Util::Config::Common::user_agent(); - my $blacklisted_url_ua_config = BlacklistedURLUserAgentConfig->new( $default_ua_config ); - my $ua = MediaWords::Util::Web::UserAgent->new( $blacklisted_url_ua_config ); - - my $hs = MediaWords::Test::HashServer->new( $TEST_HTTP_SERVER_PORT, $pages ); - $hs->start(); - - my $blacklisted_response = $ua->get( $blacklisted_url ); - my $whitelisted_response = $ua->get( $whitelisted_url ); - - $hs->stop(); - - ok( !$blacklisted_response->is_success() ); - ok( $blacklisted_response->error_is_client_side() ); - isnt_urls( $blacklisted_response->request()->url(), $blacklisted_url ); - - ok( $whitelisted_response->is_success() ); - is_urls( $whitelisted_response->request()->url(), $whitelisted_url ); - - ok( -e $whitelist_temp_file ); - ok( !-e $blacklist_temp_file ); -} - -sub test_get_http_auth() -{ - my $pages = { - '/auth' => { - auth => 'username1:password2', - content => 'Authenticated!', - } - }; - - my $hs = MediaWords::Test::HashServer->new( $TEST_HTTP_SERVER_PORT, $pages ); - $hs->start(); - - my $ua = MediaWords::Util::Web::UserAgent->new(); - - { - # No auth - my $no_auth_url = $TEST_HTTP_SERVER_URL . "/auth"; - my $no_auth_response = $ua->get( $no_auth_url ); - ok( !$no_auth_response->is_success() ); - is( $no_auth_response->code(), HTTP_UNAUTHORIZED ); - } - - { - # Invalid auth in URL - my $invalid_auth_url = - 'http://incorrect_username1:incorrect_password2@localhost:' . $TEST_HTTP_SERVER_PORT . "/auth"; - my $invalid_auth_response = $ua->get( $invalid_auth_url ); - ok( !$invalid_auth_response->is_success() ); - is( $invalid_auth_response->code(), HTTP_UNAUTHORIZED ); - } - - { - # Valid auth in URL - my $valid_auth_url = 'http://username1:password2@localhost:' . $TEST_HTTP_SERVER_PORT . "/auth"; - my $valid_auth_response = $ua->get( $valid_auth_url ); - ok( $valid_auth_response->is_success() ); - is( $valid_auth_response->code(), HTTP_OK ); - is( $valid_auth_response->decoded_content(), 'Authenticated!' ); - } - - my $base_auth_url = $TEST_HTTP_SERVER_URL . "/auth"; - - { - # Invalid auth in request - my $invalid_auth_request = MediaWords::Util::Web::UserAgent::Request->new( 'GET', $base_auth_url ); - $invalid_auth_request->set_authorization_basic( 'incorrect_username1', 'incorrect_password2' ); - my $invalid_auth_response = $ua->request( $invalid_auth_request ); - ok( !$invalid_auth_response->is_success() ); - is( $invalid_auth_response->code(), HTTP_UNAUTHORIZED ); - } - - { - # Valid auth in request - my $valid_auth_request = MediaWords::Util::Web::UserAgent::Request->new( 'GET', $base_auth_url ); - $valid_auth_request->set_authorization_basic( 'username1', 'password2' ); - my $valid_auth_response = $ua->request( $valid_auth_request ); - ok( $valid_auth_response->is_success() ); - is( $valid_auth_response->code(), HTTP_OK ); - is( $valid_auth_response->decoded_content(), 'Authenticated!' ); - } - - $hs->stop(); -} - -sub test_get_authenticated_domains() -{ - # This is what get_url_distinctive_domain() returns for whatever reason - my $domain = 'localhost.localhost'; - my $username = 'username1'; - my $password = 'password2'; - - my $pages = { - '/auth' => { - auth => "$username:$password", - content => 'Authenticated!', - } - }; - - my $hs = MediaWords::Test::HashServer->new( $TEST_HTTP_SERVER_PORT, $pages ); - $hs->start(); - - my $base_auth_url = $TEST_HTTP_SERVER_URL . "/auth"; - - { - { - package NoAuthUserAgentConfig; - - use strict; - use warnings; - - use base 'MediaWords::Util::Config::Common::UserAgent'; - - sub authenticated_domains() - { - return []; - } - - 1; - } - - my $default_ua_config = MediaWords::Util::Config::Common::user_agent(); - my $no_auth_ua_config = NoAuthUserAgentConfig->new( $default_ua_config ); - my $ua = MediaWords::Util::Web::UserAgent->new( $no_auth_ua_config ); - - my $no_auth_response = $ua->get( $base_auth_url ); - ok( !$no_auth_response->is_success() ); - is( $no_auth_response->code(), HTTP_UNAUTHORIZED ); - } - - { - { - package IncorrectDomain; - - use strict; - use warnings; - - sub new($) - { - my ( $class ) = @_; - - my $self = {}; - bless $self, $class; - - return $self; - } - - sub domain() { return $domain; } - sub username() { return 'incorrect_username1'; } - sub password() { return 'incorrect_password2'; } - - 1; - } - - { - package IncorrectAuthUserAgentConfig; - - use strict; - use warnings; - - use base 'MediaWords::Util::Config::Common::UserAgent'; - - sub authenticated_domains() { return [ IncorrectDomain->new() ]; } - - 1; - } - - my $default_ua_config = MediaWords::Util::Config::Common::user_agent(); - my $incorrect_auth_ua_config = IncorrectAuthUserAgentConfig->new( $default_ua_config ); - my $ua = MediaWords::Util::Web::UserAgent->new( $incorrect_auth_ua_config ); - - my $invalid_auth_response = $ua->get( $base_auth_url ); - ok( !$invalid_auth_response->is_success() ); - is( $invalid_auth_response->code(), HTTP_UNAUTHORIZED ); - } - - { - { - package CorrectDomain; - - use strict; - use warnings; - - sub new($) - { - my ( $class ) = @_; - - my $self = {}; - bless $self, $class; - - return $self; - } - - sub domain() { return $domain; } - sub username() { return $username; } - sub password() { return $password; } - - 1; - } - - { - package CorrectAuthUserAgentConfig; - - use strict; - use warnings; - - use base 'MediaWords::Util::Config::Common::UserAgent'; - - sub authenticated_domains() { return [ CorrectDomain->new() ]; } - - 1; - } - - my $default_ua_config = MediaWords::Util::Config::Common::user_agent(); - my $correct_auth_ua_config = CorrectAuthUserAgentConfig->new( $default_ua_config ); - my $ua = MediaWords::Util::Web::UserAgent->new( $correct_auth_ua_config ); - - my $valid_auth_response = $ua->get( $base_auth_url ); - ok( $valid_auth_response->is_success() ); - is( $valid_auth_response->code(), HTTP_OK ); - is( $valid_auth_response->decoded_content(), 'Authenticated!' ); - } - - $hs->stop(); -} - -sub test_get_follow_http_html_redirects_http() -{ - my $ua = MediaWords::Util::Web::UserAgent->new(); - - eval { $ua->get_follow_http_html_redirects( undef ); }; - ok( $@, 'Undefined URL' ); - - eval { $ua->get_follow_http_html_redirects( 'gopher://gopher.floodgap.com/0/v2/vstat' ); }; - ok( $@, 'Non-HTTP(S) URL' ); - - my $starting_url = $TEST_HTTP_SERVER_URL . '/first'; - - # HTTP redirects - my $pages = { - '/first' => { redirect => '/second', http_status_code => HTTP_MOVED_PERMANENTLY }, - '/second' => { redirect => $TEST_HTTP_SERVER_URL . '/third', http_status_code => HTTP_FOUND }, - '/third' => { redirect => '/fourth', http_status_code => HTTP_SEE_OTHER }, - '/fourth' => { redirect => $TEST_HTTP_SERVER_URL . '/fifth', http_status_code => HTTP_TEMPORARY_REDIRECT }, - '/fifth' => 'Seems to be working.' - }; - - my $hs = MediaWords::Test::HashServer->new( $TEST_HTTP_SERVER_PORT, $pages ); - $hs->start(); - - my $response = $ua->get_follow_http_html_redirects( $starting_url ); - - $hs->stop(); - - is_urls( $response->request()->url(), $TEST_HTTP_SERVER_URL . '/fifth', 'URL after HTTP redirects' ); - is( $response->decoded_content(), $pages->{ '/fifth' }, 'Data after HTTP redirects' ); -} - -sub test_get_follow_http_html_redirects_nonexistent() -{ - Readonly my $TEST_HTTP_SERVER_URL => 'http://localhost:' . $TEST_HTTP_SERVER_PORT; - my $starting_url = $TEST_HTTP_SERVER_URL . '/first'; - - # Nonexistent URL ("/first") - my $pages = {}; - - my $hs = MediaWords::Test::HashServer->new( $TEST_HTTP_SERVER_PORT, $pages ); - $hs->start(); - - my $ua = MediaWords::Util::Web::UserAgent->new(); - my $response = $ua->get_follow_http_html_redirects( $starting_url ); - - $hs->stop(); - - ok( !$response->is_success ); - is_urls( $response->request()->url(), $starting_url, 'URL after unsuccessful HTTP redirects' ); -} - -sub test_get_follow_http_html_redirects_html() -{ - Readonly my $TEST_HTTP_SERVER_URL => 'http://localhost:' . $TEST_HTTP_SERVER_PORT; - my $starting_url = $TEST_HTTP_SERVER_URL . '/first'; - - # HTML redirects - my $pages = { - '/first' => '', - '/second' => '', - '/third' => '', - '/fourth' => '< meta content="url=fifth" http-equiv="refresh" >', - '/fifth' => 'Seems to be working too.' - }; - - my $hs = MediaWords::Test::HashServer->new( $TEST_HTTP_SERVER_PORT, $pages ); - $hs->start(); - - my $ua = MediaWords::Util::Web::UserAgent->new(); - my $response = $ua->get_follow_http_html_redirects( $starting_url ); - - $hs->stop(); - - is_urls( $response->request()->url(), $TEST_HTTP_SERVER_URL . '/fifth', 'URL after HTML redirects' ); - is( $response->decoded_content(), $pages->{ '/fifth' }, 'Data after HTML redirects' ); -} - -sub test_get_follow_http_html_redirects_http_loop() -{ - Readonly my $TEST_HTTP_SERVER_URL => 'http://localhost:' . $TEST_HTTP_SERVER_PORT; - my $starting_url = $TEST_HTTP_SERVER_URL . '/first'; - - # "http://127.0.0.1:9998/third?url=http%3A%2F%2F127.0.0.1%2Fsecond" - my $third = '/third?url=' . uri_escape( $TEST_HTTP_SERVER_URL . '/second' ); - - # HTTP redirects - my $pages = { - -# e.g. http://rss.nytimes.com/c/34625/f/640350/s/3a08a24a/sc/1/l/0L0Snytimes0N0C20A140C0A50C0A40Cus0Cpolitics0Cobama0Ewhite0Ehouse0Ecorrespondents0Edinner0Bhtml0Dpartner0Frss0Gemc0Frss/story01.htm - '/first' => { redirect => '/second', http_status_code => HTTP_SEE_OTHER }, - - # e.g. http://www.nytimes.com/2014/05/04/us/politics/obama-white-house-correspondents-dinner.html?partner=rss&emc=rss - '/second' => { redirect => $third, http_status_code => HTTP_SEE_OTHER }, - -# e.g. http://www.nytimes.com/glogin?URI=http%3A%2F%2Fwww.nytimes.com%2F2014%2F05%2F04%2Fus%2Fpolitics%2Fobama-white-house-correspondents-dinner.html%3Fpartner%3Drss%26emc%3Drss - '/third' => { redirect => '/second', http_status_code => HTTP_SEE_OTHER } - }; - - my $hs = MediaWords::Test::HashServer->new( $TEST_HTTP_SERVER_PORT, $pages ); - $hs->start(); - - my $ua = MediaWords::Util::Web::UserAgent->new(); - my $response = $ua->get_follow_http_html_redirects( $starting_url ); - - $hs->stop(); - - is_urls( $response->request()->url(), $TEST_HTTP_SERVER_URL . '/second', 'URL after HTTP redirect loop' ); -} - -sub test_get_follow_http_html_redirects_html_loop() -{ - Readonly my $TEST_HTTP_SERVER_URL => 'http://localhost:' . $TEST_HTTP_SERVER_PORT; - my $starting_url = $TEST_HTTP_SERVER_URL . '/first'; - - # HTML redirects - my $pages = { - '/first' => '', - '/second' => '', - '/third' => '', - }; - - my $hs = MediaWords::Test::HashServer->new( $TEST_HTTP_SERVER_PORT, $pages ); - $hs->start(); - - my $ua = MediaWords::Util::Web::UserAgent->new(); - my $response = $ua->get_follow_http_html_redirects( $starting_url ); - - $hs->stop(); - - is_urls( $response->request()->url(), $TEST_HTTP_SERVER_URL . '/first', 'URL after HTML redirect loop' ); -} - -# Test if the subroutine acts nicely when the server decides to ensure that the -# client supports cookies (e.g. -# http://www.dailytelegraph.com.au/news/world/charlie-hebdo-attack-police-close-in-on-two-armed-massacre-suspects-as-manhunt-continues-across-france/story-fni0xs63-1227178925700) -sub test_get_follow_http_html_redirects_cookies() -{ - Readonly my $TEST_HTTP_SERVER_URL => 'http://localhost:' . $TEST_HTTP_SERVER_PORT; - my $starting_url = $TEST_HTTP_SERVER_URL . '/first'; - Readonly my $TEST_CONTENT => 'This is the content.'; - - Readonly my $COOKIE_NAME => "test_cookie"; - Readonly my $COOKIE_VALUE => "I'm a cookie and I know it!"; - Readonly my $DEFAULT_HEADER => "Content-Type: text/html; charset=UTF-8"; - - # HTTP redirects - my $pages = { - '/first' => { - callback => sub { - my ( $request ) = @_; - - my $cookies = $request->cookies(); - - my $received_cookie = $cookies->{ $COOKIE_NAME }; - my $response = ''; - - if ( $received_cookie and $received_cookie eq $COOKIE_VALUE ) - { - - TRACE "Cookie was set previously, showing page"; - - $response .= "HTTP/1.0 200 OK\r\n"; - $response .= "$DEFAULT_HEADER\r\n"; - $response .= "\r\n"; - $response .= $TEST_CONTENT; - - } - else - { - - TRACE "Setting cookie, redirecting to /check_cookie"; - - $response .= "HTTP/1.0 302 Moved Temporarily\r\n"; - $response .= "$DEFAULT_HEADER\r\n"; - $response .= "Location: /check_cookie\r\n"; - $response .= "Set-Cookie: $COOKIE_NAME=$COOKIE_VALUE\r\n"; - $response .= "\r\n"; - $response .= "Redirecting to the cookie check page..."; - } - - return $response; - } - }, - - '/check_cookie' => { - callback => sub { - - my ( $request ) = @_; - - my $cookies = $request->cookies(); - - my $received_cookie = $cookies->{ $COOKIE_NAME }; - my $response = ''; - - if ( $received_cookie and $received_cookie eq $COOKIE_VALUE ) - { - - TRACE "Cookie was set previously, redirecting back to the initial page"; - - $response .= "HTTP/1.0 302 Moved Temporarily\r\n"; - $response .= "$DEFAULT_HEADER\r\n"; - $response .= "Location: $starting_url\r\n"; - $response .= "\r\n"; - $response .= "Cookie looks fine, redirecting you back to the article..."; - - } - else - { - - TRACE "Cookie wasn't found, redirecting you to the /no_cookies page..."; - - $response .= "HTTP/1.0 302 Moved Temporarily\r\n"; - $response .= "$DEFAULT_HEADER\r\n"; - $response .= "Location: /no_cookies\r\n"; - $response .= "\r\n"; - $response .= 'Cookie wasn\'t found, redirecting you to the "no cookies" page...'; - } - - return $response; - } - }, - '/no_cookies' => "No cookie support, go away, we don\'t like you." - }; - - my $hs = MediaWords::Test::HashServer->new( $TEST_HTTP_SERVER_PORT, $pages ); - $hs->start(); - - my $ua = MediaWords::Util::Web::UserAgent->new(); - my $response = $ua->get_follow_http_html_redirects( $starting_url ); - - $hs->stop(); - - is_urls( $response->request()->url(), $starting_url, 'URL after HTTP redirects (cookie)' ); - is( $response->decoded_content(), $TEST_CONTENT, 'Data after HTTP redirects (cookie)' ); -} - -sub test_get_follow_http_html_redirects_previous_responses() -{ - # HTTP redirect - sub _page_http_redirect($) - { - my $page = shift; - - return { - callback => sub { - my ( $request ) = @_; - - my $response = ''; - $response .= "HTTP/1.0 302 Moved Temporarily\r\n"; - $response .= "Content-Type: text/plain; charset=UTF-8\r\n"; - $response .= "Location: $page\r\n"; - $response .= "\r\n"; - $response .= "Redirect to $page..."; - - return $response; - } - }; - } - - # redirect - sub _page_html_redirect($) - { - my $page = shift; - - return ""; - } - - # Various types of redirects mixed together to test setting previous() - my $pages = { - - '/page_1' => _page_http_redirect( '/page_2' ), - - '/page_2' => _page_html_redirect( '/page_3' ), - - '/page_3' => _page_http_redirect( '/page_4' ), - '/page_4' => _page_http_redirect( '/page_5' ), - - '/page_5' => _page_html_redirect( '/page_6' ), - '/page_6' => _page_html_redirect( '/page_7' ), - - # Final page - '/page_7' => 'Finally!', - - }; - - Readonly my $TEST_HTTP_SERVER_URL => 'http://localhost:' . $TEST_HTTP_SERVER_PORT; - my $starting_url = $TEST_HTTP_SERVER_URL . '/page_1'; - - my $hs = MediaWords::Test::HashServer->new( $TEST_HTTP_SERVER_PORT, $pages ); - $hs->start(); - - my $ua = MediaWords::Util::Web::UserAgent->new(); - my $response = $ua->get_follow_http_html_redirects( $starting_url ); - - $hs->stop(); - - ok( $response->is_success() ); - is( $response->decoded_content(), 'Finally!' ); - is_urls( $response->request()->url(), "$TEST_HTTP_SERVER_URL/page_7" ); - - # Test original_request() - ok( $response->original_request() ); - is_urls( $response->original_request()->url(), "$TEST_HTTP_SERVER_URL/page_1" ); - - # Test previous() - $response = $response->previous(); - ok( $response ); - ok( $response->request() ); - is_urls( $response->request()->url(), "$TEST_HTTP_SERVER_URL/page_6" ); - - $response = $response->previous(); - ok( $response ); - ok( $response->request() ); - is_urls( $response->request()->url(), "$TEST_HTTP_SERVER_URL/page_5" ); - - $response = $response->previous(); - ok( $response ); - ok( $response->request() ); - is_urls( $response->request()->url(), "$TEST_HTTP_SERVER_URL/page_4" ); - - $response = $response->previous(); - ok( $response ); - ok( $response->request() ); - is_urls( $response->request()->url(), "$TEST_HTTP_SERVER_URL/page_3" ); - - $response = $response->previous(); - ok( $response ); - ok( $response->request() ); - is_urls( $response->request()->url(), "$TEST_HTTP_SERVER_URL/page_2" ); - - $response = $response->previous(); - ok( $response ); - ok( $response->request() ); - is_urls( $response->request()->url(), "$TEST_HTTP_SERVER_URL/page_1" ); - - ok( !$response->previous() ); -} - -sub test_parallel_get() -{ - my $pages = { - - # Test UTF-8 while we're at it - '/a' => '𝘛𝘩𝘪𝘴 𝘪𝘴 𝘱𝘢𝘨𝘦 𝘈.', # - '/b' => '𝕿𝖍𝖎𝖘 𝖎𝖘 𝖕𝖆𝖌𝖊 𝕭.', # - '/c' => '𝕋𝕙𝕚𝕤 𝕚𝕤 𝕡𝕒𝕘𝕖 ℂ.', # - '/timeout' => { - callback => sub { - my ( $request ) = @_; - - my $response = ''; - - $response .= "HTTP/1.0 200 OK\r\n"; - $response .= "Content-Type: text/html; charset=UTF-8\r\n"; - $response .= "\r\n"; - $response .= "And now we wait"; - - sleep( 10 ); - - return $response; - } - }, - }; - - { - package TimeoutFasterUserAgentConfig; - - use strict; - use warnings; - - use base 'MediaWords::Util::Config::Common::UserAgent'; - - sub parallel_get_timeout() - { - return 2; # time out faster - } - - 1; - } - - my $default_ua_config = MediaWords::Util::Config::Common::user_agent(); - my $timeout_faster_ua_config = TimeoutFasterUserAgentConfig->new( $default_ua_config ); - my $ua = MediaWords::Util::Web::UserAgent->new( $timeout_faster_ua_config ); - - my $base_url = 'http://localhost:' . $TEST_HTTP_SERVER_PORT; - my $urls = [ - "$base_url/a", - "$base_url/b", - "$base_url/c", - "$base_url/timeout", # times out - "$base_url/does-not-exist", # does not exist - ]; - - my $hs = MediaWords::Test::HashServer->new( $TEST_HTTP_SERVER_PORT, $pages ); - $hs->start(); - - my $responses = $ua->parallel_get( $urls ); - - $hs->stop(); - - ok( $responses ); - ok( scalar( @{ $responses } ) == scalar( @{ $urls } ) ); - - my $path_responses = {}; - foreach my $response ( @{ $responses } ) - { - my $path = URI->new( $response->request->url )->path; - $path_responses->{ $path } = $response; - } - - ok( $path_responses->{ '/a' } ); - ok( $path_responses->{ '/a' }->is_success ); - is( $path_responses->{ '/a' }->decoded_content, $pages->{ '/a' } ); - - ok( $path_responses->{ '/b' } ); - ok( $path_responses->{ '/b' }->is_success ); - is( $path_responses->{ '/b' }->decoded_content, $pages->{ '/b' } ); - - ok( $path_responses->{ '/c' } ); - ok( $path_responses->{ '/c' }->is_success ); - is( $path_responses->{ '/c' }->decoded_content, $pages->{ '/c' } ); - - ok( $path_responses->{ '/does-not-exist' } ); - ok( !$path_responses->{ '/does-not-exist' }->is_success ); - is( $path_responses->{ '/does-not-exist' }->code, 404 ); - - ok( $path_responses->{ '/timeout' } ); - ok( !$path_responses->{ '/timeout' }->is_success ); - is( $path_responses->{ '/timeout' }->code, 408 ); -} - -sub test_determined_retries() -{ - # We'll use temporary file for inter-process communication because callback - # will be run in a separate fork so won't be able to modify variable on - # main process - my ( $fh, $request_count_filename ) = tempfile(); - close( $fh ); - - write_file( $request_count_filename, '0' ); - - my $pages = { - - # Page that doesn't work the first two times - '/temporarily-buggy-page' => { - callback => sub { - my ( $request ) = @_; - - my $response = ''; - - my $temporarily_buggy_page_request_count = int( read_file( $request_count_filename ) ); - ++$temporarily_buggy_page_request_count; - write_file( $request_count_filename, $temporarily_buggy_page_request_count ); - - if ( $temporarily_buggy_page_request_count < 3 ) - { - - say STDERR "Simulating failure for $temporarily_buggy_page_request_count time..."; - $response .= "HTTP/1.0 500 Internal Server Error\r\n"; - $response .= "Content-Type: text/plain\r\n"; - $response .= "\r\n"; - $response .= "something's wrong"; - - } - else - { - - say STDERR "Returning successful request..."; - $response .= "HTTP/1.0 200 OK\r\n"; - $response .= "Content-Type: text/plain\r\n"; - $response .= "\r\n"; - $response .= "success on request $temporarily_buggy_page_request_count"; - } - - return $response; - - } - }, - - # Page that doesn't work at all - '/permanently-buggy-page' => { - callback => sub { - my ( $request ) = @_; - - my $response = ''; - $response .= "HTTP/1.0 500 Internal Server Error\r\n"; - $response .= "Content-Type: text/plain\r\n"; - $response .= "\r\n"; - $response .= "something's wrong"; - - return $response; - - } - }, - - }; - my $hs = MediaWords::Test::HashServer->new( $TEST_HTTP_SERVER_PORT, $pages ); - - $hs->start(); - - my $ua = MediaWords::Util::Web::UserAgent->new(); - $ua->set_timeout( 2 ); # time-out really fast - - # Try disabling retries - $ua->set_timing( undef ); - is( $ua->timing(), undef ); - - # Reenable timing - $ua->set_timing( [ 1, 2, 4 ] ); - - # For whatever reason we have to assign current timing() value to a - # variable and only then we can cmp_deeply() it - my $timing = $ua->timing(); - cmp_deeply( $timing, [ 1, 2, 4 ] ); - - { - my $response = $ua->get( $TEST_HTTP_SERVER_URL . '/temporarily-buggy-page' ); - ok( $response->is_success, 'Request should ultimately succeed' ); - is( $response->decoded_content, "success on request 3" ); - } - - { - my $response = $ua->get( $TEST_HTTP_SERVER_URL . '/permanently-buggy-page' ); - ok( !$response->is_success, 'Request should fail' ); - } - - $hs->stop(); -} - -sub test_get_string() -{ - my $pages = { '/exists' => 'I do exist.', }; - my $hs = MediaWords::Test::HashServer->new( $TEST_HTTP_SERVER_PORT, $pages ); - $hs->start(); - - my $ua = MediaWords::Util::Web::UserAgent->new(); - - my $exists_string = $ua->get_string( "$TEST_HTTP_SERVER_URL/exists" ); - my $does_not_exist_string = $ua->get_string( "$TEST_HTTP_SERVER_URL/does-not-exist" ); - - $hs->stop(); - - is( $exists_string, 'I do exist.' ); - is( $does_not_exist_string, undef ); -} - -sub test_post() -{ - - sub _parse_query_string($) - { - my $query_string = shift; - - my $uri = URI->new( 'http://test/?' . $query_string ); - my $params = $uri->query_form_hash(); - - my $params_decoded = {}; - foreach my $key ( keys %{ $params } ) - { - my $value = $params->{ $key }; - $params_decoded->{ decode_utf8( $key ) } = decode_utf8( $value ); - } - - return $params_decoded; - } - - # User-Agent: and From: headers - my $pages = { - '/test-post' => { - callback => sub { - my ( $request ) = @_; - - my $response = ''; - - $response .= "HTTP/1.0 200 OK\r\n"; - $response .= "Content-Type: application/json; charset=UTF-8\r\n"; - $response .= "\r\n"; - $response .= MediaWords::Util::ParseJSON::encode_json( - { - 'method' => $request->method(), - 'content-type' => $request->content_type(), - 'content' => _parse_query_string( $request->content() ), - } - ); - - return $response; - } - } - }; - my $hs = MediaWords::Test::HashServer->new( $TEST_HTTP_SERVER_PORT, $pages ); - $hs->start(); - - my $ua = MediaWords::Util::Web::UserAgent->new(); - my $url = "$TEST_HTTP_SERVER_URL/test-post"; - - # UTF-8 string request - { - my $request = MediaWords::Util::Web::UserAgent::Request->new( 'POST', $url ); - $request->set_content_type( 'application/x-www-form-urlencoded; charset=utf-8' ); - $request->set_content( 'ą=č&ė=ž' ); - - my $response = $ua->request( $request ); - - ok( $response->is_success() ); - is_urls( $response->request()->url(), $TEST_HTTP_SERVER_URL . '/test-post' ); - - my $decoded_json = MediaWords::Util::ParseJSON::decode_json( $response->decoded_content() ); - cmp_deeply( - $decoded_json, - { - 'method' => 'POST', - 'content-type' => 'application/x-www-form-urlencoded; charset=utf-8', - 'content' => { - 'ą' => 'č', - 'ė' => 'ž', - }, - } - ); - } - - # UTF-8 hashref request - { - my $request = MediaWords::Util::Web::UserAgent::Request->new( 'POST', $url ); - $request->set_content_type( 'application/x-www-form-urlencoded; charset=utf-8' ); - $request->set_content( - { - 'ą' => 'č', - 'ė' => 'ž', - } - ); - - my $response = $ua->request( $request ); - - ok( $response->is_success() ); - is_urls( $response->request()->url(), $TEST_HTTP_SERVER_URL . '/test-post' ); - - my $decoded_json = MediaWords::Util::ParseJSON::decode_json( $response->decoded_content() ); - cmp_deeply( - $decoded_json, - { - 'method' => 'POST', - 'content-type' => 'application/x-www-form-urlencoded; charset=utf-8', - 'content' => { - 'ą' => 'č', - 'ė' => 'ž', - }, - } - ); - } - - # UTF-8 encoded string request - { - my $request = MediaWords::Util::Web::UserAgent::Request->new( 'POST', $url ); - $request->set_content_type( 'application/x-www-form-urlencoded; charset=utf-8' ); - $request->set_content( encode_utf8( 'ą=č&ė=ž' ) ); - - my $response = $ua->request( $request ); - - ok( $response->is_success() ); - is_urls( $response->request()->url(), $TEST_HTTP_SERVER_URL . '/test-post' ); - - my $decoded_json = MediaWords::Util::ParseJSON::decode_json( $response->decoded_content() ); - cmp_deeply( - $decoded_json, - { - 'method' => 'POST', - 'content-type' => 'application/x-www-form-urlencoded; charset=utf-8', - 'content' => { - 'ą' => 'č', - 'ė' => 'ž', - }, - } - ); - } - - # UTF-8 encoded hashref request - { - my $request = MediaWords::Util::Web::UserAgent::Request->new( 'POST', $url ); - $request->set_content_type( 'application/x-www-form-urlencoded; charset=utf-8' ); - $request->set_content( - { - encode_utf8( 'ą' ) => encode_utf8( 'č' ), - encode_utf8( 'ė' ) => encode_utf8( 'ž' ), - } - ); - - my $response = $ua->request( $request ); - - ok( $response->is_success() ); - is_urls( $response->request()->url(), $TEST_HTTP_SERVER_URL . '/test-post' ); - - my $decoded_json = MediaWords::Util::ParseJSON::decode_json( $response->decoded_content() ); - cmp_deeply( - $decoded_json, - { - 'method' => 'POST', - 'content-type' => 'application/x-www-form-urlencoded; charset=utf-8', - 'content' => { - 'ą' => 'č', - 'ė' => 'ž', - }, - } - ); - } - - $hs->stop(); -} - -sub main() -{ - my $builder = Test::More->builder; - binmode $builder->output, ":utf8"; - binmode $builder->failure_output, ":utf8"; - binmode $builder->todo_output, ":utf8"; - - test_get(); - test_get_user_agent_from_headers(); - test_get_not_found(); - test_get_valid_utf8_content(); - test_get_invalid_utf8_content(); - test_get_non_utf8_content(); - test_get_max_size(); - test_get_max_redirect(); - test_get_request_headers(); - test_get_response_status(); - test_get_response_headers(); - test_get_response_content_type(); - test_get_blacklisted_url(); - test_get_http_auth(); - test_get_authenticated_domains(); - - test_get_follow_http_html_redirects_nonexistent(); - test_get_follow_http_html_redirects_http(); - test_get_follow_http_html_redirects_html(); - test_get_follow_http_html_redirects_http_loop(); - test_get_follow_http_html_redirects_html_loop(); - test_get_follow_http_html_redirects_cookies(); - test_get_follow_http_html_redirects_previous_responses(); - - test_post(); - - test_parallel_get(); - test_determined_retries(); - test_get_string(); -} - -main(); diff --git a/apps/common/tests/perl/MediaWords/Util/Web/UserAgent/HTMLRedirects.t b/apps/common/tests/perl/MediaWords/Util/Web/UserAgent/HTMLRedirects.t deleted file mode 100644 index d1603aa776..0000000000 --- a/apps/common/tests/perl/MediaWords/Util/Web/UserAgent/HTMLRedirects.t +++ /dev/null @@ -1,184 +0,0 @@ -use strict; -use warnings; -use utf8; - -use Test::More tests => 14; -use Test::NoWarnings; - -use Test::Deep; - -use MediaWords::Util::Web::UserAgent::HTMLRedirects; -use MediaWords::Test::URLs; - -sub test_target_request_from_meta_refresh_url() -{ - is_urls( - MediaWords::Util::Web::UserAgent::HTMLRedirects::target_request_from_meta_refresh_url( - < - - This is a test - - - - -

This is a test.

- - -EOF - 'http://example2.com/' - )->url(), - 'http://example.com/', - ' refresh' - ); -} - -sub test_target_request_from_archive_is_url() -{ - is_urls( - MediaWords::Util::Web::UserAgent::HTMLRedirects::target_request_from_archive_is_url( - '', # - 'https://archive.is/20170201/https://bar.com/foo/bar' # - )->url(), - 'https://bar.com/foo/bar', # - 'archive.is' # - ); - - is( - MediaWords::Util::Web::UserAgent::HTMLRedirects::target_request_from_archive_is_url( - '', # - 'https://bar.com/foo/bar' # - ), - undef, # - 'archive.is with non-matching URL' # - ); -} - -sub test_target_request_from_archive_org_url() -{ - is_urls( - MediaWords::Util::Web::UserAgent::HTMLRedirects::target_request_from_archive_org_url( - undef, # - 'https://web.archive.org/web/20150204024130/http://www.john-daly.com/hockey/hockey.htm' # - )->url(), - 'http://www.john-daly.com/hockey/hockey.htm', # - 'archive.org' # - ); - - is( - MediaWords::Util::Web::UserAgent::HTMLRedirects::target_request_from_archive_org_url( - undef, # - 'http://www.john-daly.com/hockey/hockey.htm' # - ), - undef, # - 'archive.org with non-matching URL' # - ); -} - -sub test_target_request_from_linkis_com_url() -{ - is_urls( - MediaWords::Util::Web::UserAgent::HTMLRedirects::target_request_from_linkis_com_url( - 'url(), - 'http://og.url/test', # - 'linkis.com ' # - ); - - is_urls( - MediaWords::Util::Web::UserAgent::HTMLRedirects::target_request_from_linkis_com_url( - 'url(), - 'http://you.tube/test', # - 'linkis.com YouTube' # - ); - - is_urls( - MediaWords::Util::Web::UserAgent::HTMLRedirects::target_request_from_linkis_com_url( - '